From 83affd526f70cadb403bfdb09b74a354fc6926cc Mon Sep 17 00:00:00 2001 From: Gary Helmling Date: Tue, 25 Jun 2013 19:38:21 -0500 Subject: [PATCH] Initial commit Signed-off-by: Chris Aniszczyk --- .gitignore | 10 + .settings/org.eclipse.core.resources.prefs | 7 + .settings/org.eclipse.jdt.core.prefs | 292 +++ .settings/org.eclipse.jdt.ui.prefs | 4 + .settings/org.eclipse.m2e.core.prefs | 5 + .travis.yml | 3 + CONTRIBUTING.md | 70 + LICENSE | 202 ++ README.md | 179 ++ bin/README | 73 + bin/create_schema.rb | 50 + bin/encode_job_id.rb | 45 + bin/etl/hraven-etl.sh | 74 + bin/etl/jobFileLoader.sh | 44 + bin/etl/jobFilePreprocessor.sh | 46 + bin/etl/jobFileProcessor.sh | 46 + bin/etl/pidfiles.sh | 110 + bin/etl/processingRecordsPrinter.sh | 35 + bin/find_framework.rb | 191 ++ bin/find_partial_jobs.rb | 57 + bin/find_partial_raw.rb | 60 + bin/get_flow.rb | 98 + bin/get_flow_events.rb | 87 + bin/get_flow_stats.rb | 108 + bin/get_job.rb | 61 + bin/get_raw.rb | 67 + bin/hraven | 194 ++ bin/hraven-daemon.sh | 194 ++ bin/job_level_statistics.rb | 225 ++ conf/hraven-env.sh | 38 + conf/log4j.properties | 40 + dev-support/hraven_eclipse_formatter.xml | 310 +++ hraven-assembly/pom.xml | 52 + hraven-assembly/src/main/assembly/all.xml | 79 + .../.settings/org.eclipse.jdt.core.prefs | 5 + hraven-core/pom.xml | 409 ++++ .../twitter/hraven/ClientObjectMapper.java | 108 + .../main/java/com/twitter/hraven/Cluster.java | 50 + .../java/com/twitter/hraven/Constants.java | 289 +++ .../main/java/com/twitter/hraven/Counter.java | 56 + .../java/com/twitter/hraven/CounterMap.java | 55 + .../main/java/com/twitter/hraven/Flow.java | 394 +++ .../java/com/twitter/hraven/FlowEvent.java | 67 + .../java/com/twitter/hraven/FlowEventKey.java | 46 + .../main/java/com/twitter/hraven/FlowKey.java | 150 ++ .../java/com/twitter/hraven/FlowQueueKey.java | 78 + .../java/com/twitter/hraven/Framework.java | 91 + .../main/java/com/twitter/hraven/JobDesc.java | 165 ++ .../com/twitter/hraven/JobDescFactory.java | 104 + .../twitter/hraven/JobDescFactoryBase.java | 127 + .../java/com/twitter/hraven/JobDetails.java | 502 ++++ .../com/twitter/hraven/JobHistoryKeys.java | 99 + .../main/java/com/twitter/hraven/JobId.java | 139 ++ .../main/java/com/twitter/hraven/JobKey.java | 149 ++ .../com/twitter/hraven/MRJobDescFactory.java | 62 + .../com/twitter/hraven/PigJobDescFactory.java | 109 + .../com/twitter/hraven/QualifiedJobId.java | 55 + .../main/java/com/twitter/hraven/Range.java | 60 + .../hraven/ScaldingJobDescFactory.java | 138 ++ .../java/com/twitter/hraven/TaskDetails.java | 259 ++ .../main/java/com/twitter/hraven/TaskKey.java | 82 + .../hraven/datasource/AppVersionService.java | 209 ++ .../hraven/datasource/ByteConverter.java | 23 + .../hraven/datasource/DataException.java | 32 + .../datasource/FlowEventKeyConverter.java | 59 + .../hraven/datasource/FlowEventService.java | 188 ++ .../hraven/datasource/FlowKeyConverter.java | 53 + .../datasource/FlowQueueKeyConverter.java | 63 + .../hraven/datasource/FlowQueueService.java | 200 ++ .../datasource/JobHistoryByIdService.java | 104 + .../datasource/JobHistoryRawService.java | 580 +++++ .../hraven/datasource/JobHistoryService.java | 711 ++++++ .../hraven/datasource/JobIdConverter.java | 42 + .../hraven/datasource/JobKeyConverter.java | 142 ++ .../MissingColumnInResultException.java | 67 + .../datasource/ProcessingException.java | 31 + .../datasource/QualifiedJobIdConverter.java | 47 + .../datasource/RowKeyParseException.java | 39 + .../hraven/datasource/RunMatchFilter.java | 112 + 
.../hraven/datasource/TaskKeyConverter.java | 55 + .../hraven/datasource/VersionInfo.java | 84 + .../hraven/rest/ObjectMapperProvider.java | 199 ++ .../twitter/hraven/rest/PaginatedResult.java | 102 + .../twitter/hraven/rest/RestJSONResource.java | 275 +++ .../com/twitter/hraven/rest/RestServer.java | 131 + .../hraven/rest/SerializationContext.java | 124 + .../hraven/rest/client/HRavenRestClient.java | 299 +++ .../com/twitter/hraven/util/BatchUtil.java | 121 + .../com/twitter/hraven/util/ByteUtil.java | 241 ++ .../com/twitter/hraven/util/DateUtil.java | 40 + .../com/twitter/hraven/util/JSONUtil.java | 65 + .../com/twitter/hraven/util/StringUtil.java | 44 + .../main/resources/hadoopclusters.properties | 4 + .../java/com/twitter/hraven/AllTests.java | 40 + .../twitter/hraven/GenerateFlowTestData.java | 119 + .../com/twitter/hraven/TestFramework.java | 59 + .../twitter/hraven/TestJobDescFactory.java | 42 + .../hraven/TestJobDescFactoryBase.java | 45 + .../twitter/hraven/TestJobHistoryKeys.java | 83 + .../java/com/twitter/hraven/TestJobId.java | 115 + .../java/com/twitter/hraven/TestJobKey.java | 198 ++ .../com/twitter/hraven/TestJsonSerde.java | 229 ++ .../twitter/hraven/TestMRJobDescFactory.java | 58 + .../twitter/hraven/TestPigJobDescFactory.java | 67 + .../hraven/TestScaldingJobDescFactory.java | 147 ++ .../java/com/twitter/hraven/TestTaskKey.java | 82 + .../hraven/datasource/HRavenTestUtil.java | 85 + .../datasource/TestAppVersionService.java | 194 ++ .../datasource/TestFlowEventService.java | 107 + .../datasource/TestFlowQueueKeyConverter.java | 45 + .../datasource/TestFlowQueueService.java | 123 + .../datasource/TestJobHistoryRawService.java | 240 ++ .../datasource/TestJobHistoryService.java | 302 +++ .../hraven/rest/TestPaginatedResult.java | 76 + .../twitter/hraven/util/TestBatchUtil.java | 114 + .../com/twitter/hraven/util/TestByteUtil.java | 267 ++ ...259_job_201205231531_256984_userName1_App1 | 4 + .../src/test/resources/log4j.properties | 20 + .../.settings/org.eclipse.jdt.core.prefs | 5 + hraven-etl/pom.xml | 410 ++++ .../com/twitter/hraven/etl/FileLister.java | 89 + .../etl/FileStatusModificationComparator.java | 55 + .../twitter/hraven/etl/ImportException.java | 31 + .../java/com/twitter/hraven/etl/JobFile.java | 155 ++ .../etl/JobFileModifiedRangePathFilter.java | 138 ++ .../hraven/etl/JobFilePartitioner.java | 541 +++++ .../twitter/hraven/etl/JobFilePathFilter.java | 49 + .../hraven/etl/JobFilePreprocessor.java | 428 ++++ .../twitter/hraven/etl/JobFileProcessor.java | 612 +++++ .../twitter/hraven/etl/JobFileRawLoader.java | 352 +++ .../com/twitter/hraven/etl/JobRunner.java | 92 + .../hraven/etl/MinMaxJobFileTracker.java | 138 ++ .../com/twitter/hraven/etl/ProcessRecord.java | 263 ++ .../twitter/hraven/etl/ProcessRecordKey.java | 54 + .../hraven/etl/ProcessRecordKeyConverter.java | 42 + .../hraven/etl/ProcessRecordService.java | 496 ++++ .../hraven/etl/ProcessRecordUpdater.java | 82 + .../com/twitter/hraven/etl/ProcessState.java | 81 + .../hraven/etl/ProcessingRecordsPrinter.java | 222 ++ .../mapreduce/CombineFileInputFormat.java | 626 +++++ .../mapreduce/JobFileRawLoaderMapper.java | 242 ++ .../hraven/mapreduce/JobFileTableMapper.java | 274 +++ .../hraven/mapreduce/JobHistoryListener.java | 259 ++ .../hraven/mapreduce/ProcessingCounter.java | 30 + .../apache/hadoop/mapred/JobHistoryCopy.java | 2163 +++++++++++++++++ .../com/twitter/hraven/JobConfFileTest.java | 56 + .../java/com/twitter/hraven/TestJobFile.java | 78 + .../hraven/etl/AssertHistoryListener.java | 172 ++ 
...tFileStatusModificationTimeComparator.java | 66 + .../twitter/hraven/etl/TestProcessRecord.java | 81 + .../src/test/resources/log4j.properties | 20 + pom.xml | 228 ++ todo | 7 + 153 files changed, 22988 insertions(+) create mode 100644 .gitignore create mode 100644 .settings/org.eclipse.core.resources.prefs create mode 100644 .settings/org.eclipse.jdt.core.prefs create mode 100644 .settings/org.eclipse.jdt.ui.prefs create mode 100644 .settings/org.eclipse.m2e.core.prefs create mode 100644 .travis.yml create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 bin/README create mode 100755 bin/create_schema.rb create mode 100755 bin/encode_job_id.rb create mode 100755 bin/etl/hraven-etl.sh create mode 100755 bin/etl/jobFileLoader.sh create mode 100755 bin/etl/jobFilePreprocessor.sh create mode 100755 bin/etl/jobFileProcessor.sh create mode 100755 bin/etl/pidfiles.sh create mode 100755 bin/etl/processingRecordsPrinter.sh create mode 100755 bin/find_framework.rb create mode 100755 bin/find_partial_jobs.rb create mode 100755 bin/find_partial_raw.rb create mode 100755 bin/get_flow.rb create mode 100755 bin/get_flow_events.rb create mode 100755 bin/get_flow_stats.rb create mode 100755 bin/get_job.rb create mode 100755 bin/get_raw.rb create mode 100755 bin/hraven create mode 100755 bin/hraven-daemon.sh create mode 100755 bin/job_level_statistics.rb create mode 100755 conf/hraven-env.sh create mode 100644 conf/log4j.properties create mode 100644 dev-support/hraven_eclipse_formatter.xml create mode 100644 hraven-assembly/pom.xml create mode 100644 hraven-assembly/src/main/assembly/all.xml create mode 100644 hraven-core/.settings/org.eclipse.jdt.core.prefs create mode 100644 hraven-core/pom.xml create mode 100644 hraven-core/src/main/java/com/twitter/hraven/ClientObjectMapper.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/Cluster.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/Constants.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/Counter.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/CounterMap.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/Flow.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/FlowEvent.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/FlowEventKey.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/FlowKey.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/FlowQueueKey.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/Framework.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/JobDesc.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/JobDescFactory.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/JobDescFactoryBase.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/JobDetails.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/JobHistoryKeys.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/JobId.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/JobKey.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/MRJobDescFactory.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/PigJobDescFactory.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/QualifiedJobId.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/Range.java create mode 
100644 hraven-core/src/main/java/com/twitter/hraven/ScaldingJobDescFactory.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/TaskDetails.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/TaskKey.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/AppVersionService.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/ByteConverter.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/DataException.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/FlowEventKeyConverter.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/FlowEventService.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/FlowKeyConverter.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/FlowQueueKeyConverter.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/FlowQueueService.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryByIdService.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryRawService.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryService.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/JobIdConverter.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/JobKeyConverter.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/MissingColumnInResultException.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/ProcessingException.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/QualifiedJobIdConverter.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/RowKeyParseException.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/RunMatchFilter.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/TaskKeyConverter.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/datasource/VersionInfo.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/rest/ObjectMapperProvider.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/rest/PaginatedResult.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/rest/RestJSONResource.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/rest/RestServer.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/rest/SerializationContext.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/rest/client/HRavenRestClient.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/util/BatchUtil.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/util/ByteUtil.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/util/DateUtil.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/util/JSONUtil.java create mode 100644 hraven-core/src/main/java/com/twitter/hraven/util/StringUtil.java create mode 100644 hraven-core/src/main/resources/hadoopclusters.properties create mode 100644 hraven-core/src/test/java/com/twitter/hraven/AllTests.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/GenerateFlowTestData.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/TestFramework.java create mode 100644 
hraven-core/src/test/java/com/twitter/hraven/TestJobDescFactory.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/TestJobDescFactoryBase.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/TestJobHistoryKeys.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/TestJobId.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/TestJobKey.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/TestJsonSerde.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/TestMRJobDescFactory.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/TestPigJobDescFactory.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/TestScaldingJobDescFactory.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/TestTaskKey.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/datasource/HRavenTestUtil.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/datasource/TestAppVersionService.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/datasource/TestFlowEventService.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/datasource/TestFlowQueueKeyConverter.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/datasource/TestFlowQueueService.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/datasource/TestJobHistoryRawService.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/datasource/TestJobHistoryService.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/rest/TestPaginatedResult.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/util/TestBatchUtil.java create mode 100644 hraven-core/src/test/java/com/twitter/hraven/util/TestByteUtil.java create mode 100644 hraven-core/src/test/resources/done/something.example.com_1337787092259_job_201205231531_256984_userName1_App1 create mode 100644 hraven-core/src/test/resources/log4j.properties create mode 100644 hraven-etl/.settings/org.eclipse.jdt.core.prefs create mode 100644 hraven-etl/pom.xml create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/FileLister.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/FileStatusModificationComparator.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/ImportException.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/JobFile.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileModifiedRangePathFilter.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePartitioner.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePathFilter.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePreprocessor.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileProcessor.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileRawLoader.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/JobRunner.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/MinMaxJobFileTracker.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecord.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordKey.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordKeyConverter.java create mode 100644 
hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordService.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordUpdater.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessState.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessingRecordsPrinter.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/mapreduce/CombineFileInputFormat.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobFileRawLoaderMapper.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobFileTableMapper.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobHistoryListener.java create mode 100644 hraven-etl/src/main/java/com/twitter/hraven/mapreduce/ProcessingCounter.java create mode 100644 hraven-etl/src/main/java/org/apache/hadoop/mapred/JobHistoryCopy.java create mode 100644 hraven-etl/src/test/java/com/twitter/hraven/JobConfFileTest.java create mode 100644 hraven-etl/src/test/java/com/twitter/hraven/TestJobFile.java create mode 100644 hraven-etl/src/test/java/com/twitter/hraven/etl/AssertHistoryListener.java create mode 100644 hraven-etl/src/test/java/com/twitter/hraven/etl/TestFileStatusModificationTimeComparator.java create mode 100644 hraven-etl/src/test/java/com/twitter/hraven/etl/TestProcessRecord.java create mode 100644 hraven-etl/src/test/resources/log4j.properties create mode 100644 pom.xml create mode 100644 todo diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..604cfe7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# Exclude Eclipse files that can be generated with mvn eclipse:eclipse +.classpath +.project + +# Used as a local build/deploy dev-cycle script +deploy.sh + +# Where Maven generates its output. 
+*/target/* +*/build/* diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..d853f46 --- /dev/null +++ b/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,7 @@ +#Thu Apr 19 10:31:29 PDT 2012 +eclipse.preferences.version=1 +encoding//src/main/java=UTF-8 +encoding//src/main/resources=UTF-8 +encoding//src/test/java=UTF-8 +encoding//src/test/resources=UTF-8 +encoding/=UTF-8 diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..e6ffc12 --- /dev/null +++ b/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,292 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 +org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve +org.eclipse.jdt.core.compiler.compliance=1.6 +org.eclipse.jdt.core.compiler.debug.lineNumber=generate +org.eclipse.jdt.core.compiler.debug.localVariable=generate +org.eclipse.jdt.core.compiler.debug.sourceFile=generate +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.source=1.6 +org.eclipse.jdt.core.formatter.align_type_members_on_columns=false +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation=0 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression=16 +org.eclipse.jdt.core.formatter.alignment_for_assignment=0 +org.eclipse.jdt.core.formatter.alignment_for_binary_expression=16 +org.eclipse.jdt.core.formatter.alignment_for_compact_if=16 +org.eclipse.jdt.core.formatter.alignment_for_conditional_expression=80 +org.eclipse.jdt.core.formatter.alignment_for_enum_constants=0 +org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer=16 +org.eclipse.jdt.core.formatter.alignment_for_method_declaration=0 +org.eclipse.jdt.core.formatter.alignment_for_multiple_fields=16 +org.eclipse.jdt.core.formatter.alignment_for_parameters_in_constructor_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_resources_in_try=80 +org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation=16 +org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_union_type_in_multicatch=16 +org.eclipse.jdt.core.formatter.blank_lines_after_imports=1 +org.eclipse.jdt.core.formatter.blank_lines_after_package=1 +org.eclipse.jdt.core.formatter.blank_lines_before_field=0 +org.eclipse.jdt.core.formatter.blank_lines_before_first_class_body_declaration=0 
+org.eclipse.jdt.core.formatter.blank_lines_before_imports=1 +org.eclipse.jdt.core.formatter.blank_lines_before_member_type=1 +org.eclipse.jdt.core.formatter.blank_lines_before_method=1 +org.eclipse.jdt.core.formatter.blank_lines_before_new_chunk=1 +org.eclipse.jdt.core.formatter.blank_lines_before_package=0 +org.eclipse.jdt.core.formatter.blank_lines_between_import_groups=1 +org.eclipse.jdt.core.formatter.blank_lines_between_type_declarations=1 +org.eclipse.jdt.core.formatter.brace_position_for_annotation_type_declaration=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_anonymous_type_declaration=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_array_initializer=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_block=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_block_in_case=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_constructor_declaration=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_enum_constant=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_enum_declaration=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_method_declaration=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_switch=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_type_declaration=end_of_line +org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment=false +org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment=false +org.eclipse.jdt.core.formatter.comment.format_block_comments=true +org.eclipse.jdt.core.formatter.comment.format_header=false +org.eclipse.jdt.core.formatter.comment.format_html=true +org.eclipse.jdt.core.formatter.comment.format_javadoc_comments=true +org.eclipse.jdt.core.formatter.comment.format_line_comments=true +org.eclipse.jdt.core.formatter.comment.format_source_code=true +org.eclipse.jdt.core.formatter.comment.indent_parameter_description=true +org.eclipse.jdt.core.formatter.comment.indent_root_tags=true +org.eclipse.jdt.core.formatter.comment.insert_new_line_before_root_tags=insert +org.eclipse.jdt.core.formatter.comment.insert_new_line_for_parameter=insert +org.eclipse.jdt.core.formatter.comment.line_length=80 +org.eclipse.jdt.core.formatter.comment.new_lines_at_block_boundaries=true +org.eclipse.jdt.core.formatter.comment.new_lines_at_javadoc_boundaries=true +org.eclipse.jdt.core.formatter.comment.preserve_white_space_between_code_and_line_comments=false +org.eclipse.jdt.core.formatter.compact_else_if=true +org.eclipse.jdt.core.formatter.continuation_indentation=2 +org.eclipse.jdt.core.formatter.continuation_indentation_for_array_initializer=2 +org.eclipse.jdt.core.formatter.disabling_tag=@formatter\:off +org.eclipse.jdt.core.formatter.enabling_tag=@formatter\:on +org.eclipse.jdt.core.formatter.format_guardian_clause_on_one_line=false +org.eclipse.jdt.core.formatter.format_line_comment_starting_on_first_column=true +org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_annotation_declaration_header=true +org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_constant_header=true +org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_declaration_header=true +org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_type_header=true +org.eclipse.jdt.core.formatter.indent_breaks_compare_to_cases=true +org.eclipse.jdt.core.formatter.indent_empty_lines=false +org.eclipse.jdt.core.formatter.indent_statements_compare_to_block=true 
+org.eclipse.jdt.core.formatter.indent_statements_compare_to_body=true +org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_cases=true +org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_switch=false +org.eclipse.jdt.core.formatter.indentation.size=4 +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_field=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_local_variable=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_method=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_package=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_parameter=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_type=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_label=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_at_end_of_file_if_missing=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_before_catch_in_try_statement=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_before_else_in_if_statement=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_before_finally_in_try_statement=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_before_while_in_do_statement=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_in_empty_annotation_declaration=insert +org.eclipse.jdt.core.formatter.insert_new_line_in_empty_anonymous_type_declaration=insert +org.eclipse.jdt.core.formatter.insert_new_line_in_empty_block=insert +org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_constant=insert +org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_declaration=insert +org.eclipse.jdt.core.formatter.insert_new_line_in_empty_method_body=insert +org.eclipse.jdt.core.formatter.insert_new_line_in_empty_type_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_after_and_in_type_parameter=insert +org.eclipse.jdt.core.formatter.insert_space_after_assignment_operator=insert +org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation_type_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_binary_operator=insert +org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_arguments=insert +org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_parameters=insert +org.eclipse.jdt.core.formatter.insert_space_after_closing_brace_in_block=insert +org.eclipse.jdt.core.formatter.insert_space_after_closing_paren_in_cast=insert +org.eclipse.jdt.core.formatter.insert_space_after_colon_in_assert=insert +org.eclipse.jdt.core.formatter.insert_space_after_colon_in_case=insert +org.eclipse.jdt.core.formatter.insert_space_after_colon_in_conditional=insert +org.eclipse.jdt.core.formatter.insert_space_after_colon_in_for=insert +org.eclipse.jdt.core.formatter.insert_space_after_colon_in_labeled_statement=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_allocation_expression=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_annotation=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_array_initializer=insert 
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_parameters=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_throws=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_constant_arguments=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_declarations=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_explicitconstructorcall_arguments=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_increments=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_inits=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_throws=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_field_declarations=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_local_declarations=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_parameterized_type_reference=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_superinterfaces=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_arguments=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_parameters=insert +org.eclipse.jdt.core.formatter.insert_space_after_ellipsis=insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_parameterized_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_brace_in_array_initializer=insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_allocation_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_cast=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_catch=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_constructor_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_enum_constant=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_for=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_if=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_invocation=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_switch=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_synchronized=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_try=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_while=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_postfix_operator=do not insert 
+org.eclipse.jdt.core.formatter.insert_space_after_prefix_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_question_in_conditional=insert +org.eclipse.jdt.core.formatter.insert_space_after_question_in_wildcard=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_for=insert +org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_try_resources=insert +org.eclipse.jdt.core.formatter.insert_space_after_unary_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_and_in_type_parameter=insert +org.eclipse.jdt.core.formatter.insert_space_before_assignment_operator=insert +org.eclipse.jdt.core.formatter.insert_space_before_at_in_annotation_type_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_binary_operator=insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_parameterized_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_brace_in_array_initializer=insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_allocation_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_cast=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_catch=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_constructor_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_enum_constant=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_for=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_if=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_invocation=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_switch=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_synchronized=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_try=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_while=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_assert=insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_case=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_conditional=insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_default=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_for=insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_labeled_statement=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_allocation_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_array_initializer=do not insert 
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_throws=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_constant_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_declarations=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_explicitconstructorcall_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_increments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_inits=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_throws=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_field_declarations=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_local_declarations=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_parameterized_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_superinterfaces=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_ellipsis=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_parameterized_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_annotation_type_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_anonymous_type_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_array_initializer=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_block=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_constructor_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_constant=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_method_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_switch=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_type_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_allocation_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation_type_member_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_catch=insert 
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_constructor_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_enum_constant=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_for=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_if=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_invocation=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_switch=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_synchronized=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_try=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_while=insert +org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_return=insert +org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_throw=insert +org.eclipse.jdt.core.formatter.insert_space_before_postfix_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_prefix_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_question_in_conditional=insert +org.eclipse.jdt.core.formatter.insert_space_before_question_in_wildcard=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_semicolon=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_for=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_try_resources=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_unary_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_brackets_in_array_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_braces_in_array_initializer=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_brackets_in_array_allocation_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_annotation_type_member_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_constructor_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_enum_constant=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_invocation=do not insert +org.eclipse.jdt.core.formatter.join_lines_in_comments=true +org.eclipse.jdt.core.formatter.join_wrapped_lines=true +org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line=false +org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line=false +org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line=false +org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line=false +org.eclipse.jdt.core.formatter.lineSplit=80 +org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column=false +org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column=false +org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body=0 +org.eclipse.jdt.core.formatter.number_of_empty_lines_to_preserve=1 +org.eclipse.jdt.core.formatter.put_empty_statement_on_new_line=true 
+org.eclipse.jdt.core.formatter.tabulation.char=space +org.eclipse.jdt.core.formatter.tabulation.size=2 +org.eclipse.jdt.core.formatter.use_on_off_tags=false +org.eclipse.jdt.core.formatter.use_tabs_only_for_leading_indentations=false +org.eclipse.jdt.core.formatter.wrap_before_binary_operator=true +org.eclipse.jdt.core.formatter.wrap_before_or_operator_multicatch=true +org.eclipse.jdt.core.formatter.wrap_outer_expressions_when_nested=true diff --git a/.settings/org.eclipse.jdt.ui.prefs b/.settings/org.eclipse.jdt.ui.prefs new file mode 100644 index 0000000..26a095f --- /dev/null +++ b/.settings/org.eclipse.jdt.ui.prefs @@ -0,0 +1,4 @@ +#Thu Apr 19 18:23:59 PDT 2012 +eclipse.preferences.version=1 +formatter_profile=_Eclipse-Hadoop +formatter_settings_version=12 diff --git a/.settings/org.eclipse.m2e.core.prefs b/.settings/org.eclipse.m2e.core.prefs new file mode 100644 index 0000000..00f41f9 --- /dev/null +++ b/.settings/org.eclipse.m2e.core.prefs @@ -0,0 +1,5 @@ +#Wed Apr 18 15:46:33 PDT 2012 +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..c16d2f8 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,3 @@ +language: java +jdk: + - openjdk7 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..6479e1f --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,70 @@ +# Contributing to hRaven + +Looking to contribute something to hRaven? Here's how you can help. + +## Bug reports + +A bug is a _demonstrable problem_ that is caused by the code in the +repository. Good bug reports are extremely helpful - thank you! + +Guidelines for bug reports: + +1. **Use the GitHub issue search** - check if the issue has already been + reported. + +2. **Check if the issue has been fixed** - try to reproduce it using the + latest `master` or development branch in the repository. + +3. **Isolate the problem** - ideally create a reduced test case and a live + example. + +4. Please try to be as detailed as possible in your report. Include specific + information about the environment - operating system and version, versions + of Hadoop and HBase, version of hRaven - and steps required to reproduce + the issue. + + +## Feature requests & contribution enquiries + +Feature requests are welcome. But take a moment to find out whether your idea +fits with the scope and aims of the project. It's up to *you* to make a strong +case for the inclusion of your feature. Please provide as much detail and +context as possible. + +Contribution enquiries should take place before any significant pull request, +otherwise you risk spending a lot of time working on something that we might +have good reasons for rejecting. + + +## Pull requests + +Good pull requests - patches, improvements, new features - are a fantastic +help. They should remain focused in scope and avoid containing unrelated +commits. + +Make sure to adhere to the coding conventions used throughout the codebase +(indentation, accurate comments, etc.) and any other requirements (such as test +coverage). + +Please follow this process; it's the best way to get your work included in the +project: + +1. Create a new topic branch to contain your feature, change, or fix: + +2. Commit your changes in logical chunks. Provide clear and explanatory commit + messages. Use git's [interactive rebase](https://help.github.com/articles/interactive-rebase) + feature to tidy up your commits before making them public. + +3. 
Locally merge (or rebase) the upstream development branch into your topic branch: + +4. Push your topic branch up to your fork: + +5. [Open a Pull Request](http://help.github.com/send-pull-requests/) with a + clear title and description. + +## License + +By contributing your code, + +You agree to license your contribution under the terms of the Apache Public License 2.0 +https://github.com/twitter/hraven/blob/master/LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3f4df98
--- /dev/null
+++ b/README.md
@@ -0,0 +1,179 @@
+hRaven [![Build Status](https://travis-ci.org/twitter/hraven.png)](https://travis-ci.org/twitter/hraven)
+==========
+
+hRaven collects run time data and statistics from map reduce jobs running on
+Hadoop clusters and stores the collected job history in an easily queryable
+format. For the jobs that are run through frameworks (Pig or
+Scalding/Cascading) that decompose a script or application into a DAG of map
+reduce jobs for actual execution, hRaven groups job history data together by
+an application construct. This allows for easier visualization of all of the
+component jobs' execution for an application and more comprehensive trending
+and analysis over time.
+
+Requirements
+--------------------
+
+* Apache HBase (0.94+) - a running HBase cluster is required for the hRaven
+  data storage
+* Apache Hadoop - hRaven currently supports collection of job data on specific
+  versions of Hadoop:
+  * CDH3 (up to u4)
+  * *Note:* CDH3u5, Hadoop 1.x, and Hadoop 2.0 are NOT YET supported, though work is in progress to fix this
+
+Quick start
+--------------------
+
+Clone the github repo or download the latest release:
+
+    git clone git://github.com/twitter/hraven.git
+
+If you cloned the repository, build the full tarball:
+
+    mvn clean package assembly:single
+
+Extract the assembly tarball on a machine with HBase client access.
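+For example (the archive name and install location below are illustrative
+assumptions; check `hraven-assembly/target/` for the file your build actually
+produced):
+
+    tar -xzvf hraven-assembly/target/hraven-*.tar.gz -C /usr/local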
+
+Create the initial schema
+
+    hbase [--config /path/to/hbase/conf] shell bin/create_schema.rb
+
+
+Schema
+--------------------
+
+hRaven requires the following HBase tables in order to store data for map
+reduce jobs:
+
+* `job_history` - job-level statistics, one row per job
+* `job_history_task` - task-level statistics, one row per task attempt
+* `job_history-by_jobId` - index table pointing to `job_history` row by job ID
+* `job_history_app_version` - distinct versions associated with an
+  application, one row per application
+* `job_history_raw` - stores the raw job configuration and job history files,
+  as byte[] blobs
+* `job_history_process` - meta table storing progress information for the data
+  loading process
+* `flow_queue` - time based index of flows for Ambrose integration
+* `flow_event` - stores flow progress events for Ambrose integration
+
+The initial table schema can be created by running the `create_schema.rb`
+script:
+
+    hbase [--config /path/to/hbase/conf] shell bin/create_schema.rb
+
+
+Data Loading
+--------------------
+
+Currently, hRaven loads data for _completed_ map reduce jobs by reading and parsing the job history and job configuration files from HDFS. As a pre-requisite, the Hadoop Job Tracker must be configured to archive job history files in HDFS, by adding the following setting to your `mapred-site.xml` file:
+
+    <property>
+      <name>mapred.job.tracker.history.completed.location</name>
+      <value>hdfs://<namenode>:8020/hadoop/mapred/history/done</value>
+      <description>Store history and conf files for completed jobs in HDFS.</description>
+    </property>
+
+Once your Job Tracker is running with this setting in place, you can load data into hRaven with a series of map reduce jobs:
+
+1. **JobFilePreprocessor** - scans the HDFS job history archive location for newly completed jobs; writes the new filenames to a sequence file for processing in the next stage; records the sequence file name in a new row in the `job_history_process` table
+2. **JobFileRawLoader** - scans the processing table for new records from the JobFilePreprocessor; reads the associated sequence files; writes the associated job history files for each sequence file entry into the HBase `job_history_raw` table
+3. **JobFileProcessor** - reads new records from the raw table; parses the stored job history contents into individual puts for the `job_history`, `job_history_task`, and related index tables
+
+Each job has an associated shell script under the `bin/` directory. See these scripts for more details on the job parameters.
+
+REST API
+--------------------
+
+Once data has been loaded into hRaven tables, a REST API provides access to job data for common query patterns.
hRaven ships with a simple REST server, which can be started or stopped with the command:
+
+    ./bin/hraven-daemon.sh (start|stop) rest
+
+The following endpoints are currently supported:
+
+### Get Job
+
+Path: `/job/<cluster>[/jobId]`
+Returns: single job
+Optional QS Params: n/a
+
+### Get Flow By JobId
+
+Path: `/jobFlow/<cluster>[/jobId]`
+Returns: the flow for the jobId
+Optional QS Params - v1:
+
+* `limit` (default=1)
+
+### Get Flows
+
+Path: `/flow/<cluster>/<user>/<appId>[/version]`
+Returns: list of flows
+Optional QS Params - v1:
+
+* `limit` (default=1) - max number of flows to return
+* `includeConf` - filter configuration property keys to return only the given
+  names
+* `includeConfRegex` - filter configuration property keys to return only those
+  matching the given regex patterns
+
+### Get Flow Timeseries
+
+Path: `/flowStats/<cluster>/<user>/<appId>`
+Returns: list of flows with only minimal stats
+Optional QS params:
+
+* `version` (optional filter)
+* `startRow` (base64 encoded row key)
+* `startTime` (ms since epoch) - restrict results to given time window
+* `endTime` (ms since epoch) - restrict results to given time window
+* `limit` (default=100) - max flows to return
+* `includeJobs` (boolean flag) - include per-job details
+
+*Note:* This endpoint duplicates functionality from the "/flow/" endpoint and
+  may be combined back into it in the future.
+
+
+### Get App Versions
+
+Path: `/appVersion/<cluster>/<user>/<appId>/`
+Returns: list of distinct app versions
+Optional QS params:
+
+* `limit` - max results to return
+
+
+Project Resources
+--------------------
+
+### Bug tracker
+Have a bug? Please create an issue here on GitHub
+https://github.com/twitter/hraven/issues
+
+### Mailing list
+Have a question? Ask on our mailing list!
+
+*hRaven Users:*
+
+[hraven-user@googlegroups.com](http://groups.google.com/group/hraven-user)
+
+*hRaven Developers:*
+
+[hraven-dev@googlegroups.com](http://groups.google.com/group/hraven-dev)
+
+### Contributing to hRaven
+For more details on how to contribute to hRaven, see CONTRIBUTING.md.
+
+
+Known Issues
+--------------------
+
+1. CDH3u5 has changed the directory layout used by the Job Tracker to store history files for completed map reduce jobs. The JobFilePreprocessor needs to be updated to correctly handle the new structure.
+2. hRaven needs to be updated to load job history files from Hadoop 2.0.
+3. While hRaven stores the full data available from job history logs, the rolled-up statistics in the `Flow` class only represent data from successful task attempts. We plan to extend this so that the `Flow` class also reflects resources used by failed and killed task attempts.
+
+Copyright and License
+---------------------
+Copyright 2013 Twitter, Inc. and other contributors
+
+Licensed under the Apache License Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
diff --git a/bin/README b/bin/README
new file mode 100644
index 0000000..eb56912
--- /dev/null
+++ b/bin/README
@@ -0,0 +1,73 @@
+# Copyright 2013 Twitter, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +Running JRuby Scripts +------------------------------ +JRuby scripts in this directory make use of either the hRaven java APIs +or the HBase client java API directly. + +To run the scripts: + +1) Make sure the HRAVEN_CLASSPATH environment variable is set to include hbase, +zookeeper and Hadoop jars. For example by running: + + export HRAVEN_CLASSPATH=`hbase classpath` + +2) Run the ruby scripts from the bin directory: + cd hraven/bin + ./script_name.rb + +3) Or to run the script from a different directory you'll have to pass +org.jruby.Main class and the qualified path to the ruby script: + somepath/hraven/bin/hraven org.jruby.Main somepath/hraven/bin/script_name.rb + + +4) When you want to connect to a different HBase cluster, set the HBASE_CONF_DIR +environment variable to the desired HBase configuration: + + export HBASE_CONF_DIR=/etc/hbase/conf-hbase-dc1 + cd hraven/bin + ./script_name.rb + +or all on one line without exporting the environment variable: + + cd hraven/bin + HBASE_CONF_DIR=/etc/hbase/conf-hbase-dc1 ./script_name.rb + + +Using the HBase Shell +------------------------------ +The HBase shell for a specific cluster can be launched by pointing to +the cluster's configuration, ie. to run a shell on the test cluster: + + hbase --config /etc/hbase/conf-hbase-dc1 shell + + +Since many of the table row keys contain encoded versions of the job +ID, these can be difficult to access from the shell. To assist with +generating the encoded job IDs to use in the shell, run the script: + + ./encode_job_id.rb [cluster] jobid + +For example: + + $ ./encode_job_id.rb job_201204041958_222793 + \x00\x00\x00.\xD8\xB2\x08\xE6\x00\x00\x00\x00\x00\x03fI + +Then copy and paste the result, for use in shell commands. When using +the byte encoded representation, the field _must_ be enclosed in double +quotes to be handled correct in the JRuby interpreter that runs the +HBase shell: + + > get 'dev.job_history_raw', "cluster@dc1!\x00\x00\x00.\xD8\xB2\x08\xE6\x00\x00\x00\x00\x00\x03fI" diff --git a/bin/create_schema.rb b/bin/create_schema.rb new file mode 100755 index 0000000..2f0a454 --- /dev/null +++ b/bin/create_schema.rb @@ -0,0 +1,50 @@ +#!./hraven org.jruby.Main + +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
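+#
+# Note: the table definitions below assume LZO compression is available on the
+# HBase cluster; if it is not, adjust or remove the COMPRESSION settings.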
+# +# Create all hRaven tables in HBase +# +# Run this script using the HBase "shell" command: +# +# hbase [--config /path/to/hbase/conf] shell bin/create_table.rb +# +create 'job_history', {NAME => 'i', COMPRESSION => 'LZO'} + +create 'job_history_task', {NAME => 'i', COMPRESSION => 'LZO'} + +# job_history (indexed) by jobId table contains 1 column family: +# i: job-level information specifically the rowkey into the +create 'job_history-by_jobId', {NAME => 'i', COMPRESSION => 'LZO'} + +# job_history_app_version - stores all version numbers seen for a single app ID +# i: "info" -- version information +create 'job_history_app_version', {NAME => 'i', COMPRESSION => 'LZO'} + +create 'job_history_raw', {NAME => 'i', COMPRESSION => 'LZO', BLOOMFILTER => 'ROWCOL'}, + {NAME => 'r', VERSIONS => 1, COMPRESSION => 'LZO', BLOCKCACHE => false} + +# job_history_process - stores metadata about job history data loading process +# i: "info" -- process information +create 'job_history_process', {NAME => 'i', VERSIONS => 10, COMPRESSION => 'LZO'} + +# flow_queue - stores reference to each flow ID running on a cluster, reverse timestamp ordered +create 'flow_queue', {NAME => 'i', VERSIONS => 3, COMPRESSION => 'LZO', BLOOMFILTER => 'ROW'} + +# flow_event - stores events fired during pig job execution +create 'flow_event', {NAME => 'i', VERSIONS => 3, COMPRESSION => 'LZO', BLOOMFILTER => 'ROW'} + + +exit diff --git a/bin/encode_job_id.rb b/bin/encode_job_id.rb new file mode 100755 index 0000000..092c9f1 --- /dev/null +++ b/bin/encode_job_id.rb @@ -0,0 +1,45 @@ +#!./hraven org.jruby.Main +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Show the byte respresentation for a job ID, which is comprised of a cluster identifier and a jobid. +# +# Execute this script from the bin directory like this: +# hraven/bin$ ./encode_job_id.rb [cluster] jobid +# +# Or from anywhere like this: +# hraven$ bin/hraven org.jruby.Main bin/encode_job_id.rb [cluster] jobid + +include Java + +import com.twitter.hraven.datasource.JobIdConverter +import com.twitter.hraven.datasource.QualifiedJobIdConverter +import com.twitter.hraven.JobId +import com.twitter.hraven.QualifiedJobId + +import org.apache.hadoop.hbase.util.Bytes + +if ARGV.length == 2 + id = QualifiedJobId.new(ARGV[0], ARGV[1]) + puts Bytes.toStringBinary(QualifiedJobIdConverter.new().toBytes(id)) +elsif ARGV.length == 1 + id = JobId.new(ARGV[0]) + puts Bytes.toStringBinary(JobIdConverter.new().toBytes(id)) +else + puts "Usage: encode_job_id.rb [cluster] jobid" + exit 1 +end diff --git a/bin/etl/hraven-etl.sh b/bin/etl/hraven-etl.sh new file mode 100755 index 0000000..aa7551f --- /dev/null +++ b/bin/etl/hraven-etl.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Used pre-process unprocessed files in the /hadoop/mapred/history/done directory +# For each batch jobFiles, write a sequence file in /hadoop/mapred/history/processing/ +# listing the jobFiles to be loaded and create corresponding process Record. +# +# Load all the jobFiles listed in the process file from the process record into HBase +# +# Script Use: +# 1. Set below parameters to correct values for execution environment +# 2. Run using "./hraven-etl.sh" +# + +# Parameters +########## FILL IN APPROPRIATE VALUES BELOW ########## +cluster="mycluster" +mapredmaxsplitsize="204800" +batchsize="100" +schedulerpoolname="mypool" +threads="20" +hadoopconfdir=${HADOOP_CONF_DIR:-$HADOOP_HOME/conf} +hbaseconfdir=${HBASE_CONF_DIR:-$HBASE_HOME/conf} +# HDFS directories for processing and loading job history data +historyRawDir=/hadoop/mapred/history/done +historyProcessingDir=/hadoop/mapred/history/processing/ +####################################################### + +home=$(dirname $0) + +# set the hraven-core jar as part of libjars and hadoop classpath +# set this here because it only pertains to the etl logic +export LIBJARS=$home/../../lib/hraven-core.jar +export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$LIBJARS +hravenEtlJar=$home/../../lib/hraven-etl.jar + +source $home/../../conf/hraven-env.sh +source $home/pidfiles.sh + +# Each job has 2 files: history and config +batchsizejobs=$(($batchsize / 2)) + +myscriptname=$(basename "$0" .sh) +stopfile=$HRAVEN_PID_DIR/$myscriptname.stop +if [ -f $stopfile ]; then + echo "Error: not allowed to run. Remove $stopfile continue." 1>&2 + exit 1 +fi + +create_pidfile $HRAVEN_PID_DIR +trap 'cleanup_pidfile_and_exit $HRAVEN_PID_DIR' INT TERM EXIT + +# Pre-process +$home/jobFilePreprocessor.sh $hadoopconfdir $historyRawDir $historyProcessingDir $cluster $batchsize + +# Load +$home/jobFileLoader.sh $hadoopconfdir $mapredmaxsplitsize $schedulerpoolname $cluster $historyProcessingDir + +# Process +$home/jobFileProcessor.sh $hbaseconfdir $schedulerpoolname $historyProcessingDir $cluster $threads $batchsize \ No newline at end of file diff --git a/bin/etl/jobFileLoader.sh b/bin/etl/jobFileLoader.sh new file mode 100755 index 0000000..a31fb6a --- /dev/null +++ b/bin/etl/jobFileLoader.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Run on the daemon node for the specific cluster. 
+# Usage ./jobFileLoader.sh [hadoopconfdir] +# [maxsplitsize] [schedulerpoolname] [cluster] [historyprocessingdir] + +if [ $# -ne 5 ] +then + echo "Usage: `basename $0` [hadoopconfdir] [maxsplitsize] [schedulerpoolname] [cluster] [historyprocessingdir]" + exit 1 +fi + +home=$(dirname $0) +source $home/../../conf/hraven-env.sh +source $home/pidfiles.sh +myscriptname=$(basename "$0" .sh) +hravenEtlJar=$home/../../lib/hraven-etl.jar +stopfile=$HRAVEN_PID_DIR/$myscriptname.stop +LIBJARS=$home/../../lib/hraven-core.jar + +if [ -f $stopfile ]; then + echo "Error: not allowed to run. Remove $stopfile continue." 1>&2 + exit 1 +fi + +create_pidfile $HRAVEN_PID_DIR +trap 'cleanup_pidfile_and_exit $HRAVEN_PID_DIR' INT TERM EXIT + +hadoop --config $1 jar $hravenEtlJar com.twitter.hraven.etl.JobFileRawLoader -libjars=$LIBJARS -Dmapred.max.split.size=$2 -Dmapred.fairscheduler.pool=$3 -d -c $4 -p $5 diff --git a/bin/etl/jobFilePreprocessor.sh b/bin/etl/jobFilePreprocessor.sh new file mode 100755 index 0000000..cb16d76 --- /dev/null +++ b/bin/etl/jobFilePreprocessor.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Run on the daemon node per specific cluster +# Usage ./jobFilePreprocessor.sh [hadoopconfdir] +# [historyrawdir] [historyprocessingdir] [cluster] [batchsize] + +if [ $# -ne 5 ] +then + echo "Usage: `basename $0` [hadoopconfdir] [historyrawdir] [historyprocessingdir] [cluster] [batchsize]" + exit 1 +fi + +home=$(dirname $0) +source $home/../../conf/hraven-env.sh +source $home/pidfiles.sh +myscriptname=$(basename "$0" .sh) +stopfile=$HRAVEN_PID_DIR/$myscriptname.stop +hravenEtlJar=$home/../../lib/hraven-etl.jar +LIBJARS=$home/../../lib/hraven-core.jar +export HADOOP_HEAPSIZE=4000 +export HADOOP_CLASSPATH=$(ls $home/../../lib/commons-lang-*.jar) + +if [ -f $stopfile ]; then + echo "Error: not allowed to run. Remove $stopfile continue." 1>&2 + exit 1 +fi + +create_pidfile $HRAVEN_PID_DIR +trap 'cleanup_pidfile_and_exit $HRAVEN_PID_DIR' INT TERM EXIT + +hadoop --config $1 jar $hravenEtlJar com.twitter.hraven.etl.JobFilePreprocessor -libjars=$LIBJARS -d -i $2 -o $3 -c $4 -b $5 \ No newline at end of file diff --git a/bin/etl/jobFileProcessor.sh b/bin/etl/jobFileProcessor.sh new file mode 100755 index 0000000..c7124c2 --- /dev/null +++ b/bin/etl/jobFileProcessor.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Run on the daemon node per specific cluster +# This script runs on the HBase cluster +# Usage ./jobFileProcessor.sh [hadoopconfdir] +# [schedulerpoolname] [historyprocessingdir] [cluster] [threads] [batchsize] + +if [ $# -ne 6 ] +then + echo "Usage: `basename $0` [hbaseconfdir] [schedulerpoolname] [historyprocessingdir] [cluster] [threads] [batchsize]" + exit 1 +fi + +home=$(dirname $0) +source $home/../../conf/hraven-env.sh +source $home/pidfiles.sh +myscriptname=$(basename "$0" .sh) +hravenEtlJar=$home/../../lib/hraven-etl.jar +LIBJARS=$home/../../lib/hraven-core.jar +stopfile=$HRAVEN_PID_DIR/$myscriptname.stop +export HADOOP_CLASSPATH=$(ls $home/../../lib/commons-lang-*.jar) + +if [ -f $stopfile ]; then + echo "Error: not allowed to run. Remove $stopfile continue." 1>&2 + exit 1 +fi + +create_pidfile $HRAVEN_PID_DIR +trap 'cleanup_pidfile_and_exit $HRAVEN_PID_DIR' INT TERM EXIT + +hadoop --config $1 jar $hravenEtlJar com.twitter.hraven.etl.JobFileProcessor -libjars=$LIBJARS -Dmapred.fairscheduler.pool=$2 -d -p $3 -c $4 -t $5 -b $6 \ No newline at end of file diff --git a/bin/etl/pidfiles.sh b/bin/etl/pidfiles.sh new file mode 100755 index 0000000..0491be2 --- /dev/null +++ b/bin/etl/pidfiles.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This script does not do anything by itself, but includes functions for dealing with PID file locks. +# +# Usage: +## Pull in functions to manage pid files. +# source $(dirname $0)/pidfiles.sh +## Race to get the lock +# create_pidfile +## Make sure we clean up when done (even when killed, except with -9): +# trap 'cleanup_pidfile_and_exit' INT TERM EXIT + +# +# Create the process file or exit if the previous process is still running. +# Will also exit if we cannot write the pidfile, or delete a previously abandoned one. +# In case of a previously abandoned pidfile, we re-launch ourselves in the background. +# +function create_pidfile() { + mypid=$$ + myscriptname=$(basename "$0" .sh) + pidfile=$1/$myscriptname.pid + # Close stderr so no garbage goes into pidfile, + # then write mypid atomically into the PID file, or fail to write if already there + $(exec 2>&-; set -o noclobber; echo "$mypid" > "$pidfile") + # Check if the lockfile is present + if [ ! -f "$pidfile" ]; then + # pidfile should exist, if not, we failed to create so to bail out + echo pidFile does not exist, exiting + exit 1 + fi + # Read the pid from the file + currentpid=$(<"$pidfile") + # Is the recorded pid me? + if [ $mypid -ne $currentpid ]; then + # It is not me. Is the process pid in the lockfile still running? + is_already_running "$pidfile" + if [ $? -ne 0 ]; then + # No. Kill the pidfile and relaunch ourselves properly. + rm "$pidfile" + if [ $? -ne 0 ]; then + echo "Error: unable to delete pidfile $pidfile" 1>&2 + else + # fork only if we can delete the pidfile to prevent fork-bomb + $0 $@ & + fi + fi + # We did not own the pid in the pidfile. 
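+    # Either another live process holds the lock, or we relaunched ourselves
+    # in the background above; in both cases exit without touching the pidfile.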
+ exit + fi +} + +# +# Clean up the pidfile that we owned and exit +# After creating the pidfile, call this as: +# trap 'cleanup_pidfile_and_exit' INT TERM EXIT +# +function cleanup_pidfile_and_exit() { + myscriptname=$(basename "$0" .sh) + pidfile=$1/$myscriptname.pid + if [ -f "$pidfile" ]; then + rm "$pidfile" + if [ $? -ne 0 ]; then + echo "Error: unable to delete pidfile $pidfile" 1>&2 + fi + fi + exit +} + +# +# For internal use only +# +# param: the pidfile +# returns boolean 0|1 (1=no, 0=yes) +# +function is_already_running() { + pidfile="$1" + if [ ! -f "$pidfile" ]; then + # pid file does not exist + return 1 + fi + pid=$(<"$pidfile") + if [ -z "$pid" ]; then + # pid file did not contain a pid + return 1 + fi + + # check if a process with this pid exists and is an instance of this script + previous=$(ps -p $pid | grep $(basename $0)) + if [ "$previous" = "" ]; then + # There is no such process running, or the pid is not us + return 1 + else + return 0 + fi +} \ No newline at end of file diff --git a/bin/etl/processingRecordsPrinter.sh b/bin/etl/processingRecordsPrinter.sh new file mode 100755 index 0000000..f2dc1d7 --- /dev/null +++ b/bin/etl/processingRecordsPrinter.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Used to just dump out the process records in a readable form for a given cluster. +# Used for manual debugging and verification. + +# Run on the daemon node per specific cluster +# Usage ./processingRecordsPrinter.sh [hbaseconfdir] [cluster] + +if [ $# -ne 2 ] +then + echo "Usage: `basename $0` [hbaseconfdir] [cluster]" + exit 1 +fi + +home=$(dirname $0) +source $home/../../conf/hraven-env.sh +hravenEtlJar=$home/../../lib/hraven-etl.jar +LIBJARS=$home/../../lib/hraven-core.jar + +hadoop --config $1 jar $hravenEtlJar com.twitter.hraven.etl.ProcessingRecordsPrinter -libjars=$LIBJARS -c $2 diff --git a/bin/find_framework.rb b/bin/find_framework.rb new file mode 100755 index 0000000..9bb7857 --- /dev/null +++ b/bin/find_framework.rb @@ -0,0 +1,191 @@ +#!./hraven org.jruby.Main +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Prints out records from the dev.job_history table for a given framework. Optionally, filters +# can be passed and specific columnscan be requested. 
To see full usage run this wihtout arguments: +# +# $ hbase org.jruby.Main bin/find_framework.rb +# +# To point to a non-default HBase cluster pass --config: +# +# $ hbase --config /etc/hbase/conf-hbase-dc1 org.jruby.Main bin/find_framework.rb +# + +include Java + +import org.apache.hadoop.hbase.HBaseConfiguration +import org.apache.hadoop.hbase.client.HTable +import org.apache.hadoop.hbase.client.Scan +import org.apache.hadoop.hbase.filter.FilterList +import org.apache.hadoop.hbase.filter.CompareFilter +import org.apache.hadoop.hbase.filter.SingleColumnValueFilter +import org.apache.hadoop.hbase.util.Bytes + +import com.twitter.hraven.Constants + +require 'optparse' + +options = { :null_number => "", :null_string => ""} + +opts = OptionParser.new do |opts| + opts.banner = "Usage: ./find_framework.rb -f [framework] " + + opts.on("-f", "--framework FRAMEWORK", String, + "The MR framework to filter on (PIG|SCALDING|NONE)") do |o| + options[:framework] = o + end + opts.on("-l", "--limit N", Integer, "Max number of results to return") do |o| + options[:limit] = o + end + opts.on("-c", "--columns COLUMNS", String, "List of column descriptors to return (comma-delimited)") do |o| + options[:columns] = o.split(",") + end + opts.on("-s", "--start START", Integer, "Minumum timestamp (ms) to filter on") do |o| + options[:start] = o + end + opts.on("-e", "--end END", Integer, "Maximum timestamp (ms) to filter on") do |o| + options[:end] = o + end + opts.on("-N", "--null_number STRING", "Value to output when a null number is found (default is empty string)") do |o| + options[:null_number] = o + end + opts.on("-S", "--null_string STRING", "Value to output when a null string is found (default is empty string)") do |o| + options[:null_string] = o + end + opts.on_tail("-h", "--help", "Show this message") do + puts opts + exit + end +end + +opts.parse!(ARGV) + +# TODO: there must be a better way to specify a required options +if !options[:framework] + puts "Missing required field: framework" + puts opts + exit +end + +framework = options[:framework] +limit = options[:limit] || -1 + +STDERR.write "Limiting number of rows returned to #{limit}\n" if limit > -1 + +if framework == "SCALDING" + cols = options[:columns] || + ["c!scalding.flow.submitted.timestamp", "total_maps", "total_reduces", + "g!FileSystemCounters!HDFS_BYTES_READ", "gm!org.apache.hadoop.mapred.Task$Counter!MAP_OUTPUT_BYTES", + "c!cascading.app.id", "c!cascading.app.name", "c!cascading.app.version", + "c!cascading.flow.id", "c!cascading.app.id", "c!mapred.job.name", "c!scalding.flow.class.signature"] +elsif framework == "NONE" + cols = options[:columns] || ["c!mapred.job.name", "c!batch.desc"] +elsif framework == "PIG" + cols = options[:columns] || + ["c!pig.script.submitted.timestamp", "total_maps", "total_reduces", + "g!FileSystemCounters!HDFS_BYTES_READ", "gm!org.apache.hadoop.mapred.Task$Counter!MAP_OUTPUT_BYTES", + "gr!FileSystemCounters!FILE_BYTES_READ", + "g!org.apache.hadoop.mapred.JobInProgress$Counter!SLOTS_MILLIS_MAPS", + "g!org.apache.hadoop.mapred.JobInProgress$Counter!SLOTS_MILLIS_REDUCES", "job_status", + "c!mapred.job.name", "c!batch.desc"] +else + puts "Unknown framework: #{framework}, must be one of PIG, SCALDING, NONE" + exit 1 +end + +date_descriptor = "submit_time" +# we need to fetch date if date range is passed and that column isn't requested +if (options[:start] || options[:end]) + cols << date_descriptor if !cols.include?(date_descriptor) +end + +colnames = cols.join("\t") +colbytes = cols.collect {|x| Bytes.toBytes(x)} 
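+# Open the job_history table and set up a scan that requests only the selected
+# columns plus the framework column used for filtering.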
+ +c = HBaseConfiguration.create() +historyTable = HTable.new(c, Constants.HISTORY_TABLE_BYTES) + +scan = Scan.new +colbytes.each { |x| + scan.addColumn(Constants::INFO_FAM_BYTES, x) +} +scan.addColumn(Constants::INFO_FAM_BYTES, Constants::FRAMEWORK_COLUMN_BYTES) + +# by filtering to return only empty job IDs and _not_ setting filter if missing, we should only get rows missing jobid +frameworkFilter = SingleColumnValueFilter.new(Constants::INFO_FAM_BYTES, + Constants::FRAMEWORK_COLUMN_BYTES, CompareFilter::CompareOp::EQUAL, Bytes.toBytes(framework)) +frameworkFilter.setFilterIfMissing(true) + +filters = [frameworkFilter] +STDERR.write "Filtering where framework = #{framework}\n" + +# date range filters +if options[:start] + STDERR.write "Filtering where #{date_descriptor} >= #{options[:start]}\n" + filter = SingleColumnValueFilter.new(Constants::INFO_FAM_BYTES, Bytes.toBytes(date_descriptor), + CompareFilter::CompareOp::GREATER_OR_EQUAL, Bytes.toBytes(options[:start])) + filter.setFilterIfMissing(true) + filters << filter +end +if options[:end] + STDERR.write "Filtering where #{date_descriptor} < #{options[:end]}\n" + filter = SingleColumnValueFilter.new(Constants::INFO_FAM_BYTES, Bytes.toBytes(date_descriptor), + CompareFilter::CompareOp::LESS, Bytes.toBytes(options[:end])) + filter.setFilterIfMissing(true) + filters << filter +end + +filterList = FilterList.new(filters) +scan.setFilter(filterList) + +scanner = historyTable.getScanner(scan) +rowcnt = 0 + +puts "#{colnames}\trowkey" +scanner.each { |result| + break if result.nil? || result.isEmpty || (limit > 0 && rowcnt >= limit) + rowcnt += 1 + colindex = 0 + vals = colbytes.collect { |x| + val = result.getValue(Constants::INFO_FAM_BYTES, x) + if cols[colindex].start_with?("g!") || + cols[colindex].start_with?("gm!") || + cols[colindex].start_with?("gr!") || + cols[colindex].start_with?("total") + if val.nil? + valstr = options[:null_number] + else + valstr = Bytes.toLong(val) + end + else + if val.nil? + valstr = options[:null_string] + else + valstr = Bytes.toString(val) + end + end + colindex += 1 + valstr + } + + rowkey = Bytes.toStringBinary(result.getRow()) + values = vals.join("\t") + puts "#{values}\t#{rowkey}" +} + +STDERR.write "rows found: #{rowcnt}\n" diff --git a/bin/find_partial_jobs.rb b/bin/find_partial_jobs.rb new file mode 100755 index 0000000..a1213cd --- /dev/null +++ b/bin/find_partial_jobs.rb @@ -0,0 +1,57 @@ +#!./hraven org.jruby.Main +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# +# Prints out incomplete records from the dev.job_history table +# Incomplete records have no 'jobid' column, indicating that +# the job history file has not been loaded +# + +include Java + +import org.apache.hadoop.hbase.HBaseConfiguration +import org.apache.hadoop.hbase.client.HTable +import org.apache.hadoop.hbase.client.Scan +import org.apache.hadoop.hbase.filter.CompareFilter +import org.apache.hadoop.hbase.filter.SingleColumnValueFilter +import org.apache.hadoop.hbase.util.Bytes + +import com.twitter.hraven.Constants +import com.twitter.hraven.datasource.JobKeyConverter + + +c = HBaseConfiguration.create() +historyTable = HTable.new(c, Constants.HISTORY_TABLE_BYTES) +jobidBytes = Bytes.toBytes("jobid") + +scan = Scan.new +# by filtering to return only empty job IDs and _not_ setting filter if missing, we should only get rows missing jobid +scan.setFilter(SingleColumnValueFilter.new(Constants::INFO_FAM_BYTES, jobidBytes, CompareFilter::CompareOp::EQUAL, Constants::EMPTY_BYTES)) + +scanner = historyTable.getScanner(scan) +rowcnt = 0 +keyConv = JobKeyConverter.new + +scanner.each { |result| + break if result.nil? || result.isEmpty + rowcnt += 1 + rowkey = Bytes.toStringBinary(result.getRow()) + jobid = Bytes.toString(result.getValue(Constants::INFO_FAM_BYTES, jobidBytes)) + puts "#{rowkey}\t#{keyConv.fromBytes(result.getRow())}\t#{jobid}" +} + +puts "Found #{rowcnt} matching jobs" diff --git a/bin/find_partial_raw.rb b/bin/find_partial_raw.rb new file mode 100755 index 0000000..216dd20 --- /dev/null +++ b/bin/find_partial_raw.rb @@ -0,0 +1,60 @@ +#!./hraven org.jruby.Main +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# +# Prints out incomplete records from the dev.job_history table +# Incomplete records have no 'jobid' column, indicating that +# the job history file has not been loaded +# + +include Java + +import org.apache.hadoop.hbase.HBaseConfiguration +import org.apache.hadoop.hbase.client.HTable +import org.apache.hadoop.hbase.client.Scan +import org.apache.hadoop.hbase.filter.CompareFilter +import org.apache.hadoop.hbase.filter.FilterList +import org.apache.hadoop.hbase.filter.SingleColumnValueFilter +import org.apache.hadoop.hbase.util.Bytes + +import com.twitter.hraven.Constants +import com.twitter.hraven.datasource.QualifiedJobIdConverter + + +c = HBaseConfiguration.create() +historyTable = HTable.new(c, Constants.HISTORY_RAW_TABLE_BYTES) + +scan = Scan.new +# by filtering to return only empty job conf or history and _not_ setting filter if missing, we should only get rows missing raw fields +filterList = FilterList.new(FilterList::Operator::MUST_PASS_ONE) +filterList.addFilter(SingleColumnValueFilter.new(Constants::RAW_FAM_BYTES, Constants::JOBCONF_COL_BYTES, CompareFilter::CompareOp::EQUAL, Constants::EMPTY_BYTES)) +filterList.addFilter(SingleColumnValueFilter.new(Constants::RAW_FAM_BYTES, Constants::JOBHISTORY_COL_BYTES, CompareFilter::CompareOp::EQUAL, Constants::EMPTY_BYTES)) +scan.setFilter(filterList) + +scanner = historyTable.getScanner(scan) +rowcnt = 0 +keyConv = QualifiedJobIdConverter.new + +scanner.each { |result| + break if result.nil? || result.isEmpty + rowcnt += 1 + rowkey = Bytes.toStringBinary(result.getRow()) + jobid = keyConv.fromBytes(result.getRow()) + puts "#{rowkey}\t#{jobid}" +} + +puts "Found #{rowcnt} matching jobs" diff --git a/bin/get_flow.rb b/bin/get_flow.rb new file mode 100755 index 0000000..597d011 --- /dev/null +++ b/bin/get_flow.rb @@ -0,0 +1,98 @@ +#!./hraven org.jruby.Main +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Outputs the most recent flow for the given user and appId +# +require 'optparse' +include Java + +import java.util.Date +import org.apache.hadoop.hbase.HBaseConfiguration +import org.apache.hadoop.hbase.util.Bytes +import com.twitter.hraven.datasource.JobHistoryService +import com.twitter.hraven.datasource.JobKeyConverter +import com.twitter.hraven.rest.ObjectMapperProvider + +options = {} +options[:tasks] = false +options[:limit] = 1 +options[:revision] = nil +options[:json] = false +OptionParser.new do |opts| + opts.banner = "Usage: ./get_flow.rb [options] cluster user app" + + opts.on("-t", "--tasks", "Include task data") do |t| + options[:tasks] = t + end + opts.on("-l", "--limit N", Integer, "Return up to N flows (defaults to 1)") do |n| + options[:limit] = n + end + opts.on("-r", "--revision [REV]", "Only match the given application version") do |r| + options[:revision] = r + end + opts.on("-j", "--json", "Print retrieved flow in JSON format") do |j| + options[:json] = j + end +end.parse! 
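+# Helpers to render the retrieved flows either as JSON or as readable text.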
+
+def print_json(flows)
+  mapper = ObjectMapperProvider.createCustomMapper
+  flows_json = mapper.writeValueAsString(flows)
+  puts flows_json
+end
+
+def print_text(flows)
+  keyConv = JobKeyConverter.new
+  flowcnt = 0
+  flows.each { |flow|
+    flowcnt += 1
+    puts "Flow #{flowcnt}: #{flow.getAppId()}, run by #{flow.getUserName()} at #{Date.new(flow.getRunId())} (#{flow.getRunId}), #{flow.getJobs().size()} jobs"
+    puts
+    jobcnt = 0
+    flow.getJobs().each { |job|
+      jobcnt += 1
+      puts "Job #{jobcnt}: #{job.getJobId()} #{job.getJobName()} #{job.getStatus()}"
+      puts "\tkey: #{Bytes.toStringBinary(keyConv.toBytes(job.getJobKey()))}"
+      puts "\tsubmitted: #{job.getSubmitDate()} launched: #{job.getLaunchDate()} finished: #{job.getFinishDate()} runtime: #{job.getRunTime()} ms"
+      puts "\tmaps: #{job.getTotalMaps()} (#{job.getFinishedMaps()} finished / #{job.getFailedMaps()} failed)"
+      puts "\treduces: #{job.getTotalReduces()} (#{job.getFinishedReduces()} finished / #{job.getFailedReduces()} failed)"
+      puts
+    }
+  }
+end
+
+cluster = ARGV[0]
+user = ARGV[1]
+app = ARGV[2]
+
+conf = HBaseConfiguration.create()
+service = JobHistoryService.new(conf)
+
+flows = service.getFlowSeries(cluster, user, app, options[:revision], options[:tasks], options[:limit])
+service.close()
+
+if flows.nil?
+  puts "No flows found for user: #{user}, app: #{app}"
+else
+  if options[:json]
+    print_json(flows)
+  else
+    print_text(flows)
+  end
+end
+
+
diff --git a/bin/get_flow_events.rb b/bin/get_flow_events.rb
new file mode 100755
index 0000000..7c67fd3
--- /dev/null
+++ b/bin/get_flow_events.rb
@@ -0,0 +1,87 @@
+#!./hraven org.jruby.Main
+#
+# Copyright 2013 Twitter, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Outputs the events recorded for the given flow (cluster, user, app, run timestamp)
+#
+require 'optparse'
+include Java
+
+import java.text.SimpleDateFormat
+import java.util.Date
+import org.apache.hadoop.hbase.HBaseConfiguration
+import com.twitter.hraven.datasource.FlowEventService
+import com.twitter.hraven.FlowKey
+
+options = {}
+options[:follow] = false
+options[:sleepdelay] = 5
+OptionParser.new do |opts|
+  opts.banner = "Usage: ./get_flow_events.rb [options] cluster user app runtimestamp"
+
+  opts.on("-f", "--follow", "Poll for new events") do |t|
+    options[:follow] = t
+  end
+  opts.on("-s", "--sleep N", Integer, "Wait N seconds between attempts when polling (defaults to 5)") do |n|
+    options[:sleepdelay] = n
+  end
+
+end.parse!
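+# Print one line per event, formatting the event timestamp for readability.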
+
+DF = SimpleDateFormat.new("MM-dd-yyyy HH:mm:ss")
+def show_events(events)
+  events.each{ |e|
+    eventTime = DF.format(e.getTimestamp())
+    puts "#{e.getFlowEventKey().getSequence()}: #{eventTime} type=#{e.getType()} data=#{e.getEventDataJSON()}"
+  }
+end
+
+conf = HBaseConfiguration.create
+service = FlowEventService.new(conf)
+
+cluster = ARGV[0]
+user = ARGV[1]
+app = ARGV[2]
+runts = ARGV[3]
+
+fk = FlowKey.new(cluster, user, app, runts.to_i)
+
+begin
+  events = service.getFlowEvents(fk)
+  show_events(events)
+
+  if options[:follow]
+    last_e = nil
+    while true
+      sleep options[:sleepdelay]
+      puts "..."
+      # continue from last event
+      if events.size() > 0
+        last_e = events.get(events.size()-1)
+      end
+      if !last_e.nil?
+        events = service.getFlowEventsSince(last_e.getFlowEventKey())
+      else
+        # no events seen yet
+        events = service.getFlowEvents(fk)
+      end
+      show_events(events)
+    end
+  end
+ensure
+  service.close() unless service.nil?
+end
diff --git a/bin/get_flow_stats.rb b/bin/get_flow_stats.rb
new file mode 100755
index 0000000..f429d31
--- /dev/null
+++ b/bin/get_flow_stats.rb
@@ -0,0 +1,108 @@
+#!./hraven org.jruby.Main
+#
+# Copyright 2013 Twitter, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Outputs time series stats for the most recent flows of the given user and appId
+#
+require 'optparse'
+include Java
+
+import java.util.Date
+import org.apache.hadoop.hbase.HBaseConfiguration
+import org.apache.hadoop.hbase.util.Bytes
+import com.twitter.hraven.datasource.JobHistoryService
+import com.twitter.hraven.datasource.JobKeyConverter
+import com.twitter.hraven.rest.ObjectMapperProvider
+
+options = {}
+options[:tasks] = false
+options[:limit] = 1
+options[:revision] = nil
+options[:json] = false
+OptionParser.new do |opts|
+  opts.banner = "Usage: ./get_flow_stats.rb [options] cluster user app"
+
+  opts.on("-d", "--detail", "Include job details") do |d|
+    options[:detail] = true
+  end
+  opts.on("-l", "--limit N", Integer, "Return up to N flows (defaults to 1)") do |n|
+    options[:limit] = n
+  end
+  opts.on("-r", "--revision [REV]", "Only match the given application version") do |r|
+    options[:revision] = r
+  end
+  opts.on("-j", "--json", "Print retrieved flow in JSON format") do |j|
+    options[:json] = j
+  end
+  opts.on("-s", "--start [TIME]", Integer, "Start time (in millis since epoch)") do |s|
+    options[:starttime] = s
+  end
+  opts.on("-e", "--end [TIME]", Integer, "End time (in millis since epoch)") do |e|
+    options[:endtime] = e
+  end
+end.parse!
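+# Helpers to render the flow stats either as JSON or as readable text,
+# optionally including per-job details.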
+ +def print_json(flows) + mapper = ObjectMapperProvider.createCustomMapper + flows_json = mapper.writeValueAsString(flows) + puts flows_json +end + +def print_text(flows, includeJobs) + keyConv = JobKeyConverter.new + flowcnt = 0 + flows.each { |flow| + flowcnt += 1 + puts "Flow #{flowcnt}: #{flow.getAppId()}, run by #{flow.getUserName()} at #{Date.new(flow.getRunId())} (#{flow.getRunId}), #{flow.getJobs().size()} jobs" + puts + if includeJobs + jobcnt = 0 + flow.getJobs().each { |job| + jobcnt += 1 + puts "Job #{jobcnt}: #{job.getJobId()} #{job.getJobName()} #{job.getStatus()}" + puts "\tkey: #{Bytes.toStringBinary(keyConv.toBytes(job.getJobKey()))}" + puts "\tsubmitted: #{job.getSubmitDate()} launched: #{job.getLaunchDate()} finished: #{job.getFinishDate()} runtime: #{job.getRunTime()} ms" + puts "\tmaps: #{job.getTotalMaps()} (#{job.getFinishedMaps()} finished / #{job.getFailedMaps()} failed)" + puts "\treduces: #{job.getTotalReduces()} (#{job.getFinishedReduces()} finished / #{job.getFailedReduces()} failed)" + puts + } + end + } +end + +cluster = ARGV[0] +user = ARGV[1] +app = ARGV[2] + +conf = HBaseConfiguration.create() +#conf.set('hbase.client.scanner.caching', '1000') +service = JobHistoryService.new(conf) + +flows = service.getFlowTimeSeriesStats(cluster, user, app, options[:revision], options[:starttime], options[:endtime], options[:limit], nil) +service.close() + +if flows.nil? + puts "No flows found for user: #{user}, app: #{app}" +else + if options[:json] + print_json(flows) + else + print_text(flows, options[:detail]) + end +end + + diff --git a/bin/get_job.rb b/bin/get_job.rb new file mode 100755 index 0000000..b740180 --- /dev/null +++ b/bin/get_job.rb @@ -0,0 +1,61 @@ +#!./hraven org.jruby.Main +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Outputs a single job's details +# +require 'optparse' +include Java + +import org.apache.hadoop.hbase.HBaseConfiguration +import com.twitter.hraven.datasource.JobHistoryService + +options = {} +options[:tasks] = false +OptionParser.new do |opts| + opts.banner = "Usage: ./get_job.rb [options] cluster jobId" + + opts.on("-t", "--tasks", "Include task data") do |t| + options[:tasks] = t + end +end.parse! + +cluster = ARGV[0] +jobid = ARGV[1] + +conf = HBaseConfiguration.create() +service = JobHistoryService.new(conf) + +job = service.getJobByJobID(cluster, jobid, options[:tasks]) +service.close() + +if job.nil? 
+ puts "No job found for cluster: #{cluster}, jobid: #{jobid}" +else + puts "Job: #{job.getJobId()} #{job.getJobName()} #{job.getStatus()}" + puts "\tsubmitted: #{job.getSubmitDate()} launched: #{job.getLaunchDate()} finished: #{job.getFinishDate()} runtime: #{job.getRunTime()} ms" + puts "\tmaps: #{job.getTotalMaps()} (#{job.getFinishedMaps()} finished / #{job.getFailedMaps()} failed)" + puts "\treduces: #{job.getTotalReduces()} (#{job.getFinishedReduces()} finished / #{job.getFailedReduces()} failed)" + if options[:tasks] + puts "Tasks:" + job.getTasks().each { |task| + puts "\t#{task.getTaskId()}: #{task.getTaskAttemptId()} type: #{task.getType()} status: #{task.getStatus()}" + } + end +end + + diff --git a/bin/get_raw.rb b/bin/get_raw.rb new file mode 100755 index 0000000..44c3261 --- /dev/null +++ b/bin/get_raw.rb @@ -0,0 +1,67 @@ +#!./hraven org.jruby.Main +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Outputs a single job's details +# +require 'optparse' +include Java + +import java.lang.System +import org.apache.hadoop.hbase.HBaseConfiguration +import com.twitter.hraven.datasource.JobHistoryRawService +import com.twitter.hraven.QualifiedJobId + +options = {} + +OptionParser.new do |opts| + opts.banner = "Usage: ./get_raw.rb [options] cluster jobId" + + opts.on("-t", "--type TYPE", [:conf, :history], "Raw field to output (conf, history)") do |t| + options[:type] = t + end + opts.on("-f", "--file [FILENAME]", "Write the raw field to the file FILENAME") do |f| + options[:filename] = f + end +end.parse! + +cluster = ARGV[0] +jobid = ARGV[1] + +qualifiedId = QualifiedJobId.new(cluster, jobid) + +conf = HBaseConfiguration.create() +service = JobHistoryRawService.new(conf) + + +if options[:type] == :conf + rawConf = service.getRawJobConfiguration(qualifiedId) + if rawConf.nil? + puts "No job configuration found for #{qualifiedId}" + exit 1 + end + rawConf.writeXml(System.out) +elsif options[:type] == :history + rawHistory = service.getRawJobHistory(qualifiedId) + if rawHistory.nil? + puts "No job history found for #{qualifiedId}" + exit 1 + end + puts rawHistory +end + +service.close() diff --git a/bin/hraven b/bin/hraven new file mode 100755 index 0000000..838fd3f --- /dev/null +++ b/bin/hraven @@ -0,0 +1,194 @@ +#!/bin/bash +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+#
+# Sets up hraven classpath to run arbitrary classes/commands
+#
+# Environment variables:
+#
+# HRAVEN_HOME - Directory from which to run hraven. Defaults to parent directory of this script.
+#
+# HRAVEN_CONF_DIR - Alternate configuration directory. Default is ${HRAVEN_HOME}/conf.
+#
+# HRAVEN_CLASSPATH - Used to pick up the HBase jars. This must be set.
+#   For example:
+#     export HRAVEN_CLASSPATH=`hbase classpath`
+#   Or if you want to pick up the HBase version from a different cluster:
+#     export HRAVEN_CLASSPATH=`hbase --config /etc/hbase/conf-hbase-dc1 classpath`
+#
+# HRAVEN_HEAPSIZE - The maximum amount of heap to use, in MB.
+#   Default is 1000.
+#
+# HBASE_CONF_DIR - Alternate directory from which to pick up hbase configurations. Default is ${HBASE_HOME}/conf.
+#   All other hbase configurations can be set in the standard hbase manner, or supplied here instead.
+#
+
+# Default setting for heap size
+JAVA_HEAP_MAX=-Xmx1000m
+
+# Determine the bin directory
+bin=`dirname "$0"`
+bin=`cd "$bin">/dev/null; pwd`
+
+# Assume bin is within the home dir
+HRAVEN_HOME=${HRAVEN_HOME:-$bin/..}
+# suppress noisy logging
+export HBASE_ROOT_LOGGER=WARN,console
+
+# Allow running from a local build directory
+in_dev_env=false
+if [ -d "${HRAVEN_HOME}/target" ]; then
+  in_dev_env=true
+fi
+
+
+print_usage_and_exit() {
+  echo "Usage: hraven [--config confdir] <command>"
+  echo "where <command> is one of the following:"
+  echo
+  echo "rest - run the REST server"
+  echo "classpath - output the full hRaven classpath (with dependencies)"
+  echo "CLASSNAME - run the main method on the class CLASSNAME"
+  echo ""
+  echo "For example:"
+  echo "$0 org.jruby.Main <script_name.rb>"
+  exit 1
+}
+
+#check to see if the conf dir is given as an optional argument
+if [ $# -gt 1 ]; then
+  if [ "--config" = "$1" ]; then
+    shift
+    confdir=$1
+    shift
+    HRAVEN_CONF_DIR=$confdir
+  fi
+fi
+
+# Respect conf dir if set, or else default
+HRAVEN_CONF_DIR=${HRAVEN_CONF_DIR:-${HRAVEN_HOME}/conf}
+
+# Check input arguments
+if [ $# -eq 0 ]; then
+  print_usage_and_exit
+fi
+
+# Extract the command
+COMMAND=$1
+shift
+
+# If --config was specified, but not an actual config directory.
+if [ "--config" = "$COMMAND" ]; then
+  print_usage_and_exit
+fi
+
+# Source the hraven-env.sh if present
+if [ -f "${HRAVEN_CONF_DIR}/hraven-env.sh" ]; then
+  . "${HRAVEN_CONF_DIR}/hraven-env.sh"
+fi
+
+# Confirm that the hraven classpath is set.
+if [ "$HRAVEN_CLASSPATH" = "" ]; then
+  echo "HRAVEN_CLASSPATH environment variable is not set."
+  print_usage_and_exit
+fi
+
+MVN="mvn"
+if [ "$MAVEN_HOME" != "" ]; then
+  MVN=${MAVEN_HOME}/bin/mvn
+fi
+
+add_maven_deps_to_classpath() {
+  # Need to generate classpath from maven pom. This is costly so generate it
+  # and cache it. Save the file into our target dir so a mvn clean will
+  # clean it up and force us to create a new one.
+  f="${HRAVEN_HOME}/target/cached_classpath.txt"
+  if [ !
-f "${f}" ] + then + ${MVN} -f "${HRAVEN_HOME}/pom.xml" dependency:build-classpath -Dmdep.outputFile="${f}" &> /dev/null + fi + CLASSPATH=${CLASSPATH}:`cat "${f}"` +} + +add_maven_main_classes_to_classpath() { + if [ -d "$HRAVEN_HOME/hraven-core/target/classes" ]; then + CLASSPATH=${CLASSPATH}:$HRAVEN_HOME/hraven-core/target/classes + fi + if [ -d "$HRAVEN_HOME/hraven-etl/target/classes" ]; then + CLASSPATH=${CLASSPATH}:$HRAVEN_HOME/hraven-etl/target/classes + fi +} + +add_maven_test_classes_to_classpath() { + # For developers, add hbase classes to CLASSPATH + f="$HRAVEN_HOME/hraven-core/target/test-classes" + if [ -d "${f}" ]; then + CLASSPATH=${CLASSPATH}:${f} + fi +} + +add_dist_classpath() { + # Add libs to CLASSPATH + for f in $HRAVEN_HOME/lib/*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done +} + +# respect HBASE_CONF_DIR if set +CLASSPATH="${HRAVEN_CONF_DIR}:${HBASE_CONF_DIR}" + +# Add maven target directory +if $in_dev_env; then + add_maven_deps_to_classpath + add_maven_main_classes_to_classpath + add_maven_test_classes_to_classpath +else + add_dist_classpath +fi + +# Add user-specified CLASSPATH last +if [ "$HRAVEN_CLASSPATH" != "" ]; then + CLASSPATH=${CLASSPATH}:${HRAVEN_CLASSPATH} +fi + +# Honor java home if set. +if [ "$JAVA_HOME" = "" ]; then + JAVA=java +else + JAVA=$JAVA_HOME/bin/java +fi + +# check envvars which might override default args +if [ "$HRAVEN_HEAPSIZE" != "" ]; then + JAVA_HEAP_MAX="-Xmx""$HRAVEN_HEAPSIZE""m" +fi + +HRAVEN_OPTS="$HRAVEN_OPTS -Dhraven.log.dir=$HRAVEN_LOG_DIR" +HRAVEN_OPTS="$HRAVEN_OPTS -Dhraven.log.file=$HRAVEN_LOGFILE" +HRAVEN_OPTS="$HRAVEN_OPTS -Dhraven.root.logger=${HRAVEN_ROOT_LOGGER:-WARN,console}" + +if [ "$COMMAND" = "rest" ]; then + CLASS="com.twitter.hraven.rest.RestServer" +elif [ "$COMMAND" = "classpath" ]; then + echo $CLASSPATH + exit 0 +else + CLASS="$COMMAND" +fi + +# Do-it +exec $JAVA $JAVA_HEAP_MAX $HRAVEN_OPTS -classpath "$CLASSPATH" $CLASS "$@" diff --git a/bin/hraven-daemon.sh b/bin/hraven-daemon.sh new file mode 100755 index 0000000..2875fd9 --- /dev/null +++ b/bin/hraven-daemon.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash +# +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Runs a command with the bin/hraven script as a daemon (background) +# +# Environment variables: +# +# HRAVEN_HOME - Directory from which to run hraven. Defaults to parent directory of this script. +# +# HRAVEN_CONF_DIR - Alternate configuration directory. Default is ${HRAVEN_HOME}/conf. 
+# +# HRAVEN_LOG_DIR - Directory where daemon logs should be written +# +# HRAVEN_PID_DIR - Directory where daemon process ID will be written +# + +# Default setting for heap size +JAVA_HEAP_MAX=-Xmx1000m + +# Determine the bin directory +bin=`dirname "$0"` +bin=`cd "$bin">/dev/null; pwd` + +# Assume bin is within the home dir +HRAVEN_HOME=${HRAVEN_HOME:-$bin/..} +# suppress noisy logging +export HBASE_ROOT_LOGGER=WARN,console + +print_usage_and_exit() { + echo "Usage: hraven-daemon.sh [--config confdir] (start|stop|restart) " + echo "where is one of the following:" + echo + echo "rest - run the REST server" + echo "CLASSNAME - run the main method on the class CLASSNAME" + echo "" + exit 1 +} + + +# if no args specified, show usage +if [ $# -le 1 ]; then + print_usage_and_exit +fi + +#check to see if the conf dir is given as an optional argument +if [ $# -gt 1 ]; then + if [ "--config" = "$1" ]; then + shift + confdir=$1 + shift + HRAVEN_CONF_DIR=$confdir + fi +fi + +# Respect conf dir if set, or else dedault +HRAVEN_CONF_DIR=${HRAVEN_CONF_DIR:-${HRAVEN_HOME}/conf} + +# get arguments +startStop=$1 +shift + +command=$1 +shift + +rotate_log () +{ + log=$1; + num=5; + if [ -n "$2" ]; then + num=$2 + fi + if [ -f "$log" ]; then # rotate logs + while [ $num -gt 1 ]; do + prev=`expr $num - 1` + [ -f "$log.$prev" ] && mv -f "$log.$prev" "$log.$num" + num=$prev + done + mv -f "$log" "$log.$num"; + fi +} + +wait_until_done () +{ + p=$1 + cnt=${HRAVEN_SLAVE_TIMEOUT:-300} + origcnt=$cnt + while kill -0 $p > /dev/null 2>&1; do + if [ $cnt -gt 1 ]; then + cnt=`expr $cnt - 1` + sleep 1 + else + echo "Process did not complete after $origcnt seconds, killing." + kill -9 $p + exit 1 + fi + done + return 0 +} + +# setup logging details +export HRAVEN_LOG_DIR=${HRAVEN_LOG_DIR:-$HRAVEN_HOME/logs} +export HRAVEN_PID_DIR=${HRAVEN_PID_DIR:-/tmp} +HRAVEN_IDENT_STRING=${HRAVEN_IDENT_STRING:-$USER} +logprefix=hraven-${HRAVEN_IDENT_STRING}-${command}-$HOSTNAME + +export HRAVEN_LOGFILE=$logprefix.log +export HRAVEN_ROOT_LOGGER="WARN,DRFA" +loglog="${HRAVEN_LOG_DIR}/${HRAVEN_LOGFILE}" +logout=$HRAVEN_LOG_DIR/$logprefix.out +pidfile=$HRAVEN_PID_DIR/$logprefix.pid + + +case $startStop in + + (start) + mkdir -p "$HRAVEN_PID_DIR" + if [ -f $pidfile ]; then + if kill -0 `cat $pidfile` > /dev/null 2>&1; then + echo $command running as process `cat $pidfile`. Stop it first. + exit 1 + fi + fi + + rotate_log $logout + echo starting $command, logging to $logout + # Add to the command log file vital stats on our environment. + echo "`date` Starting $command on `hostname`" >> $loglog + nohup "$HRAVEN_HOME"/bin/hraven \ + --config "${HRAVEN_CONF_DIR}" \ + $command "$@" $startStop > "$logout" 2>&1 < /dev/null & + echo $! > $pidfile + sleep 1; head "$logout" + ;; + + (stop) + if [ -f $pidfile ]; then + # kill -0 == see if the PID exists + if kill -0 `cat $pidfile` > /dev/null 2>&1; then + echo -n stopping $command + echo "`date` Terminating $command" >> $loglog + kill `cat $pidfile` > /dev/null 2>&1 + while kill -0 `cat $pidfile` > /dev/null 2>&1; do + echo -n "." + sleep 1; + done + rm $pidfile + echo + else + retval=$? + echo no $command to stop because kill -0 of pid `cat $pidfile` failed with status $retval + fi + else + echo no $command to stop because no pid file $pidfile + fi + ;; + + (restart) + thiscmd=$0 + args=$@ + # stop the command + $thiscmd --config "${HRAVEN_CONF_DIR}" stop $command $args & + wait_until_done $! 
+ # wait a user-specified sleep period + sp=${HRAVEN_RESTART_SLEEP:-3} + if [ $sp -gt 0 ]; then + sleep $sp + fi + # start the command + $thiscmd --config "${HRAVEN_CONF_DIR}" start $command $args & + wait_until_done $! + ;; + + (*) + echo $usage + exit 1 + ;; + +esac diff --git a/bin/job_level_statistics.rb b/bin/job_level_statistics.rb new file mode 100755 index 0000000..211fc0c --- /dev/null +++ b/bin/job_level_statistics.rb @@ -0,0 +1,225 @@ +#!./hraven org.jruby.Main +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Groups data in job_history table by either user or user&app and sorts data by different columns. +# For example you can find out the user/app which had most tasks/mappers, wrote most to HDFS... +# Data can be filtered by time (start and end), framework and job status. +# Data can be sorted by one column or by key. +# hbase --config /etc/hbase/conf-hbase org.jruby.Main job_level_statistics.rb -S 0 -c "total_maps total_reduces" -s 1342000000000 -e 1343000000000 -j "SUCCESS" -l 100 +# +require 'optparse' +include Java + +import org.apache.hadoop.hbase.HBaseConfiguration +import org.apache.hadoop.hbase.client.HTable +import org.apache.hadoop.hbase.client.Scan +import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter +import org.apache.hadoop.hbase.filter.FilterList +import org.apache.hadoop.hbase.filter.CompareFilter +import org.apache.hadoop.hbase.filter.SingleColumnValueFilter +import org.apache.hadoop.hbase.util.Bytes + +import com.twitter.hraven.Constants +import com.twitter.hraven.datasource.JobKeyConverter + +options = {} +options[:cols] = ["total_maps", "total_reduces"] +options[:limit] = -1 +options[:apps] = false +options[:sort] = nil +options[:framework] = nil +options[:jobstatus] = nil +OptionParser.new do |opts| + opts.banner = "Usage: ./job_level_statistics.rb [options]" + + opts.on("-c", "--cols N", String, "Select the columns to do statistics on.") do |t| + options[:cols] = t.split(" ") + end + opts.on("-s", "--start START", Integer, "Minumum timestamp (ms) to filter on") do |o| + options[:start] = o + end + opts.on("-e", "--end END", Integer, "Maximum timestamp (ms) to filter on") do |o| + options[:end] = o + end + opts.on("-l", "--limit N", Integer, "Max number of results to return") do |o| + options[:limit] = o + end + opts.on("-a", "--apps", "Get information at application level. The default is user level.") do + options[:apps] = true + end + opts.on("-S", "--Sort N", Integer, + "Sort info on this column(0 based). + Defaults in no sorting (basicaly sorting on user/app). ") do |o| + options[:sort] = o + end + opts.on("-f", "--framework FRAMEWORK", String, + "The MR framework to filter on (PIG|SCALDING|NONE)") do |o| + options[:framework] = o + end + opts.on("-j", "--jobstatus J", String, "Filter by job_status (SUCCESS|KILLED|FAILED).") do |o| + options[:jobstatus] = o + end + opts.on_tail("-h", "--help", "Show this message") do + puts opts + exit + end + +end.parse! 
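The option block above collects the column list, time window, framework and job-status filters; the Ruby code that follows turns them into HBase SingleColumnValueFilters over the "i" family of the job_history table. For readers coming from the project's Java side, a rough sketch of the same filtered scan using the HBase client classes this script already imports; the class name, the total_maps column choice and the cut-off timestamp are illustrative only:

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.HTable;
    import org.apache.hadoop.hbase.client.Result;
    import org.apache.hadoop.hbase.client.ResultScanner;
    import org.apache.hadoop.hbase.client.Scan;
    import org.apache.hadoop.hbase.filter.CompareFilter;
    import org.apache.hadoop.hbase.filter.Filter;
    import org.apache.hadoop.hbase.filter.FilterList;
    import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
    import org.apache.hadoop.hbase.util.Bytes;

    import com.twitter.hraven.Constants;
    import com.twitter.hraven.JobKey;
    import com.twitter.hraven.datasource.JobKeyConverter;

    public class JobLevelScanSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable historyTable = new HTable(conf, Constants.HISTORY_TABLE_BYTES);

        byte[] totalMaps = Bytes.toBytes("total_maps");
        byte[] submitTime = Bytes.toBytes("submit_time");
        Scan scan = new Scan();
        scan.addColumn(Constants.INFO_FAM_BYTES, totalMaps);
        scan.addColumn(Constants.INFO_FAM_BYTES, submitTime);

        // Keep rows whose submit_time is at or after the cut-off; rows missing
        // the column are dropped entirely.
        SingleColumnValueFilter afterStart = new SingleColumnValueFilter(
            Constants.INFO_FAM_BYTES, submitTime,
            CompareFilter.CompareOp.GREATER_OR_EQUAL, Bytes.toBytes(1342000000000L));
        afterStart.setFilterIfMissing(true);
        List<Filter> filters = new ArrayList<Filter>();
        filters.add(afterStart);
        scan.setFilter(new FilterList(filters));

        JobKeyConverter keyConv = new JobKeyConverter();
        ResultScanner scanner = historyTable.getScanner(scan);
        for (Result result : scanner) {
          if (result == null || result.isEmpty()) {
            break;
          }
          // Row keys decode back into cluster / user / app / run / job id.
          JobKey key = keyConv.fromBytes(result.getRow());
          byte[] val = result.getValue(Constants.INFO_FAM_BYTES, totalMaps);
          long maps = (val == null) ? 0L : Bytes.toLong(val);
          System.out.println(key.getUserName() + "\t" + key.getAppId() + "\t" + maps);
        }
        scanner.close();
        historyTable.close();
      }
    }
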
+ +class DataInfo + attr_accessor :data, :counters, :count + def initialize ( data, counters, count) + @data = data + @counters = Array.new(counters.size) + counters.each_with_index { |val, index| + @counters[index] = val + } + @count = count + end +end + +date_descriptor = "submit_time" + +colnames = options[:cols].join("\t") +colbytes = options[:cols].collect {|x| Bytes.toBytes(x)} +scan = Scan.new +colbytes.each { |x| + scan.addColumn(Constants::INFO_FAM_BYTES, x) +} +scan.addColumn(Constants::INFO_FAM_BYTES, Constants::FRAMEWORK_COLUMN_BYTES) +scan.addColumn(Constants::INFO_FAM_BYTES, Bytes.toBytes(date_descriptor)) +scan.addColumn(Constants::INFO_FAM_BYTES, Bytes.toBytes("job_status")) + +config = HBaseConfiguration.create() +historyTable = HTable.new(config, Constants.HISTORY_TABLE_BYTES) + +# Filtering +filters = [] + +if options[:framework] then + frameworkFilter = SingleColumnValueFilter.new(Constants::INFO_FAM_BYTES, + Constants::FRAMEWORK_COLUMN_BYTES, + CompareFilter::CompareOp::EQUAL, + Bytes.toBytes(options[:framework])) + frameworkFilter.setFilterIfMissing(true) + filters << frameworkFilter +end + +if options[:jobstatus] then + jobstatusFilter = SingleColumnValueFilter.new(Constants::INFO_FAM_BYTES, + Bytes.toBytes("job_status"), + CompareFilter::CompareOp::EQUAL, + Bytes.toBytes(options[:jobstatus])) + filters << jobstatusFilter +end + +if options[:start] + STDERR.write "Filtering where #{date_descriptor} >= #{options[:start]}\n" + filter = SingleColumnValueFilter.new(Constants::INFO_FAM_BYTES, + Bytes.toBytes(date_descriptor), + CompareFilter::CompareOp::GREATER_OR_EQUAL, + Bytes.toBytes(options[:start])) + filter.setFilterIfMissing(true) + filters << filter +end +if options[:end] + STDERR.write "Filtering where #{date_descriptor} < #{options[:end]}\n" + filter = SingleColumnValueFilter.new(Constants::INFO_FAM_BYTES, + Bytes.toBytes(date_descriptor), + CompareFilter::CompareOp::LESS, + Bytes.toBytes(options[:end])) + filter.setFilterIfMissing(true) + filters << filter +end + +filterList = FilterList.new(filters) +scan.setFilter(filterList) +scanner = historyTable.getScanner(scan) + +keyConv = JobKeyConverter.new + +lastkey = nil +info = [] # Used for storing sums for user/app and then for sorting. +sums = Array.new(colbytes.size ,0) # Keeps the sum for a user/app for each of the columns. +count = 0 # Keeps the number of jobs for each app. + +scanner.each_with_index { |result, rowcnt| # Grouping rows by user/app and calculating the sums. + break if result.nil? || result.isEmpty + key = keyConv.fromBytes(result.getRow()) + keyfields = "#{key.getUserName()}" # user level data + if options[:apps] == true then + keyfields = "#{key.getUserName()}\t#{key.getAppId()}" #app level data + end + + if keyfields == lastkey then # same user/app so we just add to the sums. + # For each user defined column we add the value to the sums for this user/app. + colbytes.each_with_index { |x, poz| + val = result.getValue(Constants::INFO_FAM_BYTES, x) + next if val == nil + sums[poz] = sums[poz] + Bytes.toLong(val) + } + count = count + 1 + next + end + # If the last user/app had any valid data add them to the data. + info.push(DataInfo.new(lastkey, sums, count)) if count != 0 + count = 0 # Then we reset everything to 0. 
+ sums.each_with_index{ |x, poz| + sums[poz] = 0 + } + + colbytes.each_with_index { |x, poz| + val = result.getValue(Constants::INFO_FAM_BYTES, x) + next if val == nil + sums[poz] = sums[poz] + Bytes.toLong(val) + } + count = count + 1 + lastkey = keyfields + +} + +if count != 0 then + info.push(DataInfo.new(lastkey, sums, count)) +end + +#Sorting the data. +if options[:sort] then + #By a single column. + tempinfo = info.sort_by{ |a| + -a.counters[options[:sort]] + } + info =tempinfo +end + +limit = options[:limit] || -1 +STDERR.write "Limiting number of rows returned to #{limit}\n" if limit > -1 + +#Printing out the data +print "User/app " +options[:cols].each { |col| + print col.to_s + " " + } +puts "count" +info.each_with_index { |job, count| + break if count >= limit + print "#{job.data} : " + job.counters.each{ |counter| + print counter.to_s + " " + } + puts job.count.to_s +} diff --git a/conf/hraven-env.sh b/conf/hraven-env.sh new file mode 100755 index 0000000..39ac11f --- /dev/null +++ b/conf/hraven-env.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# +# Copyright 2013 Twitter, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Used to configure hRaven environment + +# JAVA_HOME The java implementation to use. Overrides JAVA_HOME. +# export JAVA_HOME= + +# HBASE_CONF_DIR Alternate directory from which to pick up hbase configurations. Default is ${HBASE_HOME}/conf. +# All other hbase configurations can be set in the standard hbase manner, or supplied here instead. +# export HBASE_CONF_DIR= + +# HADOOP_CONF_DIR Alternate directory from which to pick up hadoop configurations. Default is ${HADOOP_HOME}/conf. +# All other hadoop configurations can be set in the standard hadoop manner, or supplied here instead. +# export HADOOP_CONF_DIR= + +# export HRAVEN_CLASSPATH=`hbase classpath` +# export HRAVEN_CLASSPATH=`hbase --config /etc/hbase/conf-hbase-tst-dc1 classpath` +export HRAVEN_CLASSPATH= + +# The maximum amount of heap to use, in MB. Default is 1000. +# export HRAVEN_HEAPSIZE=1000 + +# Location for process ID files for any hRaven daemons +export HRAVEN_PID_DIR=/tmp/ \ No newline at end of file diff --git a/conf/log4j.properties b/conf/log4j.properties new file mode 100644 index 0000000..4215a33 --- /dev/null +++ b/conf/log4j.properties @@ -0,0 +1,40 @@ +# Define some default values that can be overridden by system properties +# from other start scripts +hraven.root.logger=WARN,console +hraven.log.dir=. +hraven.log.file=hraven.log + +# Define the root logger to the system property "hraven.root.logger". 
+log4j.rootLogger=${hraven.root.logger} + +# Logging Threshold +log4j.threshold=ALL + +# +# Daily Rolling File Appender +# +log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFA.File=${hraven.log.dir}/${hraven.log.file} + +# Rollver at midnight +log4j.appender.DRFA.DatePattern=.yyyy-MM-dd + +# 30-day backup +#log4j.appender.DRFA.MaxBackupIndex=30 +log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout + +# Pattern format: Date LogLevel LoggerName LogMessage +log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n + +# +# console +# Add "console" to rootlogger above if you want to use this +# +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n + +# Custom Logging levels + +log4j.logger.com.twitter.hraven=INFO diff --git a/dev-support/hraven_eclipse_formatter.xml b/dev-support/hraven_eclipse_formatter.xml new file mode 100644 index 0000000..1a5d719 --- /dev/null +++ b/dev-support/hraven_eclipse_formatter.xml @@ -0,0 +1,310 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/hraven-assembly/pom.xml b/hraven-assembly/pom.xml new file mode 100644 index 0000000..07222a3 --- /dev/null +++ b/hraven-assembly/pom.xml @@ -0,0 +1,52 @@ + + + + 4.0.0 + + com.twitter.hraven + hraven + 0.9.0-SNAPSHOT + ../ + + com.twitter.hraven + hraven-assembly + 0.9.0-SNAPSHOT + hRaven-assembly + hRaven - Assembly artifacts + pom + + + + + maven-assembly-plugin + 2.3 + + hraven-${project.version} + false + gnu + true + false + -Xmx1024m + + src/main/assembly/all.xml + + + + + + \ No newline at end of file diff --git a/hraven-assembly/src/main/assembly/all.xml b/hraven-assembly/src/main/assembly/all.xml new file mode 100644 index 0000000..2a4f5a3 --- /dev/null +++ b/hraven-assembly/src/main/assembly/all.xml @@ -0,0 +1,79 @@ + + + + + all + + tar.gz + + + + + ${project.basedir}/.. + . 
+ + *.txt + + 0644 + + + + ${project.basedir}/../conf + conf + 0644 + 0755 + + + + ${project.basedir}/../bin + bin + 0755 + 0755 + + + + ${project.basedir}/../hraven-core/src/main/scripts + scripts + 0644 + 0755 + + + + + true + + lib + false + + + runtime + 0644 + 0644 + + + + + + diff --git a/hraven-core/.settings/org.eclipse.jdt.core.prefs b/hraven-core/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..656eb0d --- /dev/null +++ b/hraven-core/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,5 @@ +#Thu May 16 09:55:03 PDT 2013 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.source=1.6 +org.eclipse.jdt.core.compiler.compliance=1.6 diff --git a/hraven-core/pom.xml b/hraven-core/pom.xml new file mode 100644 index 0000000..ebcb32e --- /dev/null +++ b/hraven-core/pom.xml @@ -0,0 +1,409 @@ + + + + + + 4.0.0 + + com.twitter.hraven + hraven + 0.9.0-SNAPSHOT + ../ + + com.twitter.hraven + hraven-core + 0.9.0-SNAPSHOT + hRaven - core + Core components of hRaven, including model classes and data access layer + + + 6.1.26 + + + + + + + + maven-compiler-plugin + 2.1 + + ${compileSource} + ${compileSource} + true + false + -Xmx1024m + + + + maven-source-plugin + 2.1.1 + + + maven-surefire-plugin + 2.5 + + 3600 + -Xmx512m + true + + + + maven-javadoc-plugin + 2.6.1 + + + maven-clean-plugin + 2.4 + + + + + build + + + + + + maven-dependency-plugin + 2.4 + + + copy-dependencies + package + + copy-dependencies + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.2 + + + + test-jar + + + + + + + + + + src/test/resources/ + + log4j.properties + done/* + + + + + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + package + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + always + + **/Test*.java + + + **/*$* + + + + + + maven-assembly-plugin + 2.3 + + true + + + + + + + + com.google.guava + guava + ${guava.version} + + + commons-logging + commons-logging + ${commons-logging.version} + + + commons-cli + commons-cli + 1.2 + + + org.apache.commons + commons-lang3 + 3.1 + + + log4j + log4j + ${log4j.version} + + + javax.jms + jms + + + javax.mail + mail + + + com.sun.jmx + jmxri + + + com.sun.jdmk + jmxtools + + + + + + org.apache.hadoop + hadoop-core + ${hadoop.version} + provided + + + hsqldb + hsqldb + + + net.sf.kosmosfs + kfs + + + org.eclipse.jdt + core + + + net.java.dev.jets3t + jets3t + + + oro + oro + + + + + org.apache.hbase + hbase + ${hbase.version} + provided + + + org.apache.thrift + thrift + + + org.mortbay.jetty + jetty + + + org.mortbay.jetty + jetty-util + + + org.mortbay.jetty + jsp-2.1 + + + org.mortbay.jetty + jsp-api-2.1 + + + org.mortbay.jetty + servlet-api-2.5 + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + + + + + com.sun.jersey + jersey-servlet + ${jersey.version} + + + com.sun.jersey + jersey-json + ${jersey.version} + + + com.sun.jersey + jersey-server + ${jersey.version} + + + com.sun.jersey + jersey-core + ${jersey.version} + + + org.codehaus.jackson + jackson-core-asl + ${jackson.version} + + + org.codehaus.jackson + jackson-mapper-asl + ${jackson.version} + + + org.codehaus.jackson + jackson-jaxrs + ${jackson.version} + + + org.codehaus.jackson + jackson-xc + ${jackson.version} + + + org.mortbay.jetty + jetty + ${jetty.version} + + + org.mortbay.jetty + jetty-util + ${jetty.version} + + + + + org.apache.hadoop + hadoop-test + ${hadoop.version} + test + + + + org.apache.hbase + hbase + ${hbase.version} + 
tests + test + + + org.apache.thrift + thrift + + + + + junit + junit + ${junit.version} + test + + + org.mockito + mockito-all + ${mockito-all.version} + test + + + + + + os.linux + + false + + Linux + + + + ${os.name}-${os.arch}-${sun.arch.data.model} + + + + os.mac + + + Mac + + + + Mac_OS_X-${sun.arch.data.model} + + + + + + + + maven-javadoc-plugin + 2.6.1 + + true + + + + default + + javadoc + + + + + + + diff --git a/hraven-core/src/main/java/com/twitter/hraven/ClientObjectMapper.java b/hraven-core/src/main/java/com/twitter/hraven/ClientObjectMapper.java new file mode 100644 index 0000000..c18e05c --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/ClientObjectMapper.java @@ -0,0 +1,108 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.codehaus.jackson.JsonParser; +import org.codehaus.jackson.JsonToken; +import org.codehaus.jackson.map.DeserializationContext; +import org.codehaus.jackson.map.JsonDeserializer; +import org.codehaus.jackson.map.ObjectMapper; +import org.codehaus.jackson.map.deser.CustomDeserializerFactory; +import org.codehaus.jackson.map.deser.StdDeserializerProvider; + +/** + * Custom Jackson ObjectMapper factory that knows how to deserialize json back into objects. This + * class lives in the same package of the object model so we can add setter methods and constructors + * as package-private as needed to the object model. + */ +// This is used in JSONUtil and the comment above not-withstanding we probably need to (re)move this +// from the top-level package. +@Deprecated +public class ClientObjectMapper { + + public static ObjectMapper createCustomMapper() { + ObjectMapper result = new ObjectMapper(); + CustomDeserializerFactory deserializerFactory = new CustomDeserializerFactory(); + + deserializerFactory.addSpecificMapping(Configuration.class, new ConfigurationDeserializer()); + deserializerFactory.addSpecificMapping(CounterMap.class, new CounterDeserializer()); + + result.setDeserializerProvider(new StdDeserializerProvider(deserializerFactory)); + return result; + } + + /** + * Custom serializer for Configuration object. We don't want to serialize the classLoader. + */ + private static class ConfigurationDeserializer extends JsonDeserializer { + + @Override + public Configuration deserialize(JsonParser jsonParser, + DeserializationContext deserializationContext) + throws IOException { + Configuration conf = new Configuration(); + + JsonToken token; + while ((token = jsonParser.nextToken()) != JsonToken.END_OBJECT) { + if (token != JsonToken.VALUE_STRING) { continue; } // all deserialized values are strings + conf.set(jsonParser.getCurrentName(), jsonParser.getText()); + } + + return conf; + } + } + + /** + * Custom serializer for Configuration object. We don't want to serialize the classLoader. 
+ */ + private static class CounterDeserializer extends JsonDeserializer { + + @Override + public CounterMap deserialize(JsonParser jsonParser, + DeserializationContext deserializationContext) + throws IOException { + CounterMap counterMap = new CounterMap(); + + JsonToken token; + while ((token = jsonParser.nextToken()) != JsonToken.END_OBJECT) { + assertToken(token, JsonToken.FIELD_NAME); + String group = jsonParser.getCurrentName(); + + assertToken(jsonParser.nextToken(), JsonToken.START_OBJECT); + while ((token = jsonParser.nextToken()) != JsonToken.END_OBJECT) { + if (token != JsonToken.VALUE_NUMBER_INT) { + continue; // all deserialized values are ints + } + + Counter counter = + new Counter(group, jsonParser.getCurrentName(), jsonParser.getLongValue()); + counterMap.add(counter); + } + } + return counterMap; + } + } + + private static void assertToken(JsonToken found, JsonToken expected) { + if (expected != found) { + throw new IllegalStateException("Expecting JsonToken to be " + expected.asString() + + ", but found JsonToken=" + found.asString()); + } + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/Cluster.java b/hraven-core/src/main/java/com/twitter/hraven/Cluster.java new file mode 100644 index 0000000..efeee82 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/Cluster.java @@ -0,0 +1,50 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +public class Cluster { + private static Map CLUSTERS_BY_HOST = new HashMap(); + + public static String getIdentifier(String hostname) { + return CLUSTERS_BY_HOST.get(hostname); + } + + static { + // read the property file + // populate the map + Properties prop = new Properties(); + try { + //TODO : property file to be moved out from resources into config dir + prop.load(Cluster.class.getResourceAsStream("/hadoopclusters.properties")); + Set hostnames = prop.stringPropertyNames(); + for (String h : hostnames) { + CLUSTERS_BY_HOST.put(h, prop.getProperty(h)); + } + } catch (IOException e) { + // An ExceptionInInitializerError will be thrown to indicate that an + // exception occurred during evaluation of a static initializer or the + // initializer for a static variable. + throw new ExceptionInInitializerError( + " Could not load properties file hadoopclusters.properties "); + } + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/Constants.java b/hraven-core/src/main/java/com/twitter/hraven/Constants.java new file mode 100644 index 0000000..15018be --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/Constants.java @@ -0,0 +1,289 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
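ClientObjectMapper above wires per-type deserializers into a Jackson ObjectMapper so that REST clients can turn JSON back into Configuration and CounterMap objects; only string-valued fields become configuration properties and only integer-valued fields become counters. A minimal usage sketch, with a made-up JSON literal:

    import org.apache.hadoop.conf.Configuration;
    import org.codehaus.jackson.map.ObjectMapper;

    import com.twitter.hraven.ClientObjectMapper;

    public class ClientObjectMapperSketch {
      public static void main(String[] args) throws Exception {
        ObjectMapper mapper = ClientObjectMapper.createCustomMapper();

        // Each string-valued field in the JSON object becomes a property on the
        // deserialized Configuration.
        String json = "{\"mapred.job.name\":\"wordcount\",\"user.name\":\"userName1\"}";
        Configuration conf = mapper.readValue(json, Configuration.class);

        System.out.println(conf.get("mapred.job.name")); // wordcount
        System.out.println(conf.get("user.name"));       // userName1
      }
    }
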
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import java.text.SimpleDateFormat; +import java.util.TimeZone; +import org.apache.hadoop.hbase.util.Bytes; + +/** + */ +public class Constants { + + public static final String PROJECT_NAME = "hraven"; + + // HBase constants + + // separator character used between key components + public static final char SEP_CHAR = '!'; + public static final String SEP = "" + SEP_CHAR; + public static final byte[] SEP_BYTES = Bytes.toBytes(SEP); + + // common default values + public static final byte[] EMPTY_BYTES = new byte[0]; + public static final byte[] ZERO_INT_BYTES = Bytes.toBytes(0); + public static final byte[] ZERO_LONG_BYTES = Bytes.toBytes(0L); + public static final byte[] ZERO_SINGLE_BYTE = new byte[]{ 0 }; + + public static final String UNKNOWN = ""; + + public static boolean IS_DEV = false; + public static String PREFIX = IS_DEV ? "dev." : ""; + + /* **** Table names **** */ + public static String HISTORY_TABLE = PREFIX + "job_history"; + public static byte[] HISTORY_TABLE_BYTES = Bytes.toBytes(HISTORY_TABLE); + + public static String HISTORY_TASK_TABLE = HISTORY_TABLE + "_task"; + public static byte[] HISTORY_TASK_TABLE_BYTES = Bytes + .toBytes(HISTORY_TASK_TABLE); + + public static String HISTORY_BY_JOBID_TABLE = HISTORY_TABLE + "-by_jobId"; + public static byte[] HISTORY_BY_JOBID_TABLE_BYTES = Bytes + .toBytes(HISTORY_BY_JOBID_TABLE); + + public static String HISTORY_APP_VERSION_TABLE = HISTORY_TABLE + + "_app_version"; + public static byte[] HISTORY_APP_VERSION_TABLE_BYTES = Bytes + .toBytes(HISTORY_APP_VERSION_TABLE); + + public static String HISTORY_RAW_TABLE = HISTORY_TABLE + "_raw"; + public static byte[] HISTORY_RAW_TABLE_BYTES = Bytes + .toBytes(HISTORY_RAW_TABLE); + + public static final String JOB_FILE_PROCESS_TABLE = PREFIX + + "job_history_process"; + public static final byte[] JOB_FILE_PROCESS_TABLE_BYTES = Bytes + .toBytes(JOB_FILE_PROCESS_TABLE); + + public static final String FLOW_QUEUE_TABLE = PREFIX + "flow_queue"; + public static final byte[] FLOW_QUEUE_TABLE_BYTES = Bytes.toBytes(FLOW_QUEUE_TABLE); + + public static final String FLOW_EVENT_TABLE = PREFIX + "flow_event"; + public static final byte[] FLOW_EVENT_TABLE_BYTES = Bytes.toBytes(FLOW_EVENT_TABLE); + + public static final String INFO_FAM = "i"; + public static final byte[] INFO_FAM_BYTES = Bytes.toBytes(INFO_FAM); + + public static final String RAW_FAM = "r"; + public static final byte[] RAW_FAM_BYTES = Bytes.toBytes(RAW_FAM); + + /** Column qualifier prefix to namespace job configuration properties */ + public static final String JOB_CONF_COLUMN_PREFIX = "c"; + public static byte[] JOB_CONF_COLUMN_PREFIX_BYTES = Bytes + .toBytes(JOB_CONF_COLUMN_PREFIX); + + /** Column qualifier prefix to namespace counter data */ + public static final String COUNTER_COLUMN_PREFIX = "g"; + public static final byte[] COUNTER_COLUMN_PREFIX_BYTES = Bytes + .toBytes(COUNTER_COLUMN_PREFIX); + + /** Column qualifier prefix to namespace map-specific counter data */ + public static final String MAP_COUNTER_COLUMN_PREFIX = "gm"; + public static final byte[] MAP_COUNTER_COLUMN_PREFIX_BYTES = Bytes + 
.toBytes(MAP_COUNTER_COLUMN_PREFIX); + + /** Column qualifier prefix to namespace reduce-specific counter data */ + public static final String REDUCE_COUNTER_COLUMN_PREFIX = "gr"; + public static final byte[] REDUCE_COUNTER_COLUMN_PREFIX_BYTES = Bytes + .toBytes(REDUCE_COUNTER_COLUMN_PREFIX); + + public static final String JOBCONF_COL = "jobconf"; + public static final byte[] JOBCONF_COL_BYTES = Bytes.toBytes(JOBCONF_COL); + + public static final String JOBCONF_LAST_MODIFIED_COL = JOBCONF_COL + + "_last_modified"; + public static final byte[] JOBCONF_LAST_MODIFIED_COL_BYTES = Bytes + .toBytes(JOBCONF_LAST_MODIFIED_COL); + + public static final String JOBCONF_FILENAME_COL = JOBCONF_COL + "_filename"; + public static final byte[] JOBCONF_FILENAME_COL_BYTES = Bytes + .toBytes(JOBCONF_FILENAME_COL); + + public static final String JOBHISTORY_COL = "jobhistory"; + public static final byte[] JOBHISTORY_COL_BYTES = Bytes + .toBytes(JOBHISTORY_COL); + + public static final String JOBHISTORY_LAST_MODIFIED_COL = JOBHISTORY_COL + + "_last_modified"; + public static final byte[] JOBHISTORY_LAST_MODIFIED_COL_BYTES = Bytes + .toBytes(JOBHISTORY_LAST_MODIFIED_COL); + + public static final String JOBHISTORY_FILENAME_COL = JOBHISTORY_COL + + "_filename"; + public static final byte[] JOBHISTORY_FILENAME_COL_BYTES = Bytes + .toBytes(JOBHISTORY_FILENAME_COL); + + /** Column qualifer used to flag job_history_raw records for reprocessing */ + public static final String RAW_COL_REPROCESS = "reprocess"; + public static final byte[] RAW_COL_REPROCESS_BYTES = Bytes.toBytes(RAW_COL_REPROCESS); + + public static final String SUBMIT_TIME_COL = "submit_time"; + public static final byte[] SUBMIT_TIME_COL_BYTES = Bytes.toBytes(SUBMIT_TIME_COL); + + public static final String ROWKEY_COL = "rowkey"; + public static final byte[] ROWKEY_COL_BYTES = Bytes.toBytes(ROWKEY_COL); + + public static final String RECORD_TYPE_COL = "rec_type"; + public static final byte[] RECORD_TYPE_COL_BYTES = Bytes + .toBytes(RECORD_TYPE_COL); + + public static final String MIN_MOD_TIME_MILLIS_COLUMN = "min_mod_millis"; + public static final byte[] MIN_MOD_TIME_MILLIS_COLUMN_BYTES = Bytes + .toBytes(MIN_MOD_TIME_MILLIS_COLUMN); + + public static final String PROCESSED_JOB_FILES_COLUMN = "processed_job_files"; + public static final byte[] PROCESSED_JOB_FILES_COLUMN_BYTES = Bytes + .toBytes(PROCESSED_JOB_FILES_COLUMN); + + public static final String PROCESS_FILE_COLUMN = "processing_directory"; + public static final byte[] PROCESS_FILE_COLUMN_BYTES = Bytes + .toBytes(PROCESS_FILE_COLUMN); + + public static final String PROCESSING_STATE_COLUMN = "processing_state"; + public static final byte[] PROCESSING_STATE_COLUMN_BYTES = Bytes + .toBytes(PROCESSING_STATE_COLUMN); + + public static final String VERSION_COLUMN = "version"; + public static final byte[] VERSION_COLUMN_BYTES = Bytes + .toBytes(VERSION_COLUMN); + + public static final String FRAMEWORK_COLUMN = "framework"; + public static final byte[] FRAMEWORK_COLUMN_BYTES = Bytes + .toBytes(FRAMEWORK_COLUMN); + + public static final String MIN_JOB_ID_COLUMN = "min_jobid"; + public static final byte[] MIN_JOB_ID_COLUMN_BYTES = Bytes + .toBytes(MIN_JOB_ID_COLUMN); + + public static final String MAX_JOB_ID_COLUMN = "max_jobid"; + public static final byte[] MAX_JOB_ID_COLUMN_BYTES = Bytes + .toBytes(MAX_JOB_ID_COLUMN); + + // JobHistory related + public static final String JOB_CONF_FILE_END = "conf.xml"; + + // job details related counter stats + public static final String FILESYSTEM_COUNTERS = 
"FileSystemCounters"; + public static final String FILES_BYTES_READ = "FILE_BYTES_READ"; + public static final String FILES_BYTES_WRITTEN = "FILE_BYTES_WRITTEN"; + public static final String HDFS_BYTES_READ = "HDFS_BYTES_READ"; + public static final String HDFS_BYTES_WRITTEN = "HDFS_BYTES_WRITTEN"; + /* TODO: update for 2.0.3+, this class is now deprecated */ + public static final String JOBINPROGRESS_COUNTER = "org.apache.hadoop.mapred.JobInProgress$Counter"; + /* TODO: update for 2.0.3+, this class is now deprecated */ + public static final String TASK_COUNTER = "org.apache.hadoop.mapred.Task$Counter"; + public static final String SLOTS_MILLIS_MAPS = "SLOTS_MILLIS_MAPS"; + public static final String SLOTS_MILLIS_REDUCES = "SLOTS_MILLIS_REDUCES"; + public static final String REDUCE_SHUFFLE_BYTES = "REDUCE_SHUFFLE_BYTES"; + + /** + * Indicator whether a job has been processed successfully from the RAW table + * to the history and index tables. Used to skip this job from the RAW table + * for the next set of jobs to process. + */ + public static final String JOB_PROCESSED_SUCCESS_COL = "job_processed_success"; + public static final byte[] JOB_PROCESSED_SUCCESS_COL_BYTES = Bytes + .toBytes(JOB_PROCESSED_SUCCESS_COL); + + /** + * The string preceding the job submit time in a job history file. + */ + public static final String SUBMIT_TIME_PREFIX = "SUBMIT_TIME=\""; + + /** + * Raw bytes representation of {@link #SUBMIT_TIME_PREFIX}; + */ + public static final byte[] SUBMIT_TIME_PREFIX_BYTES = Bytes + .toBytes(SUBMIT_TIME_PREFIX); + + public static final String QUOTE = "\""; + public static final byte[] QUOTE_BYTES = Bytes.toBytes(QUOTE); + + /** + * The maximum length of a radix 10 string representing a long. + */ + public static final int MAX_LONG_LENGTH = Long.toString(Long.MAX_VALUE) + .length(); + + public static final String USER_CONF_KEY = "user.name"; + public static final String JOB_NAME_CONF_KEY = "mapred.job.name"; + + public static final String PIG_CONF_KEY = "pig.version"; // used to detect a + // pig job + public static final String APP_NAME_CONF_KEY = "batch.desc"; + + /** + * Added as part of PIG-2587 + */ + public static final String PIG_VERSION_CONF_KEY = "pig.logical.plan.signature"; + public static final String PIG_RUN_CONF_KEY = "pig.script.submitted.timestamp"; + public static final String PIG_LOG_FILE_CONF_KEY = "pig.logfile"; + + public static final String CASCADING_FLOW_ID_CONF_KEY = "cascading.flow.id"; + + public static final String CASCADING_APP_NAME_CONF_KEY = "scalding.flow.class.name"; + public static final String CASCADING_VERSION_CONF_KEY = "scalding.flow.class.signature"; + public static final String CASCADING_RUN_CONF_KEY = "scalding.flow.submitted.timestamp"; + public static final String CASCADING_APP_ID_CONF_KEY = "cascading.app.id"; + + public static final String MR_RUN_CONF_KEY = "mapred.app.submitted.timestamp"; + + /** + * Timestamp format used to create processing directories + */ + public static final SimpleDateFormat TIMESTAMP_FORMAT = new SimpleDateFormat( + "yyyyMMddHHmmss"); + + // Initialize to use UTC + static { + TIMESTAMP_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC")); + } + + /** + * Regex to parse a job file name. First back-ref is JT name, second one if + * job-id + *
+ * For example, + * cluster-jt.identifier.example.com_1333569494142_job_201204041958_150125_conf + * .xml + */ + public static final String JOB_FILENAME_PATTERN_REGEX = "^([^_]*)_[^_]*_(job_[0-9]*_[0-9]*)_(.*)$"; + + /** + * Regex to parse pig logfile name such as + * "/var/log/pig/pig_1334818693838.log" + */ + public static final String PIG_LOGFILE_PATTERN_REGEX = "^.*pig_([0-9]*).log$"; + + /** + * Used for legacy jobs that don't have batch.desc set. Should convert from + * something like this: + * PigLatin:daily_job:daily_2012/06/22-00:00:00_to_2012/06/23-00:00:00 top this: + * PigJobDescFactory.SCHEDULED_PREFIX + "daily_job:daily" + */ + public static final String PIG_SCHEDULED_JOBNAME_PATTERN_REGEX = "^PigLatin:([^:]*(:.*?)+)_([0-9]{4})/.*"; + + /** + * Used to pass the cluster name from the tool to the RecordReader. + */ + public static final String CLUSTER_JOB_CONF_KEY = "jobhistory.cluster"; + + /** + * Used to pass boolean to mappers to indicate that items are to be reprocessed. + */ + public static final String FORCE_REPROCESS_CONF_KEY = "force.reprocess"; +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/Counter.java b/hraven-core/src/main/java/com/twitter/hraven/Counter.java new file mode 100644 index 0000000..1bac4c5 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/Counter.java @@ -0,0 +1,56 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +/** + * Represents a single counter entry, which may be associated with a job, + * task, or task attempt. + */ +public class Counter { + private String group; + private String key; + private long value; + + public Counter(String group, String key, long value) { + this.group = group; + this.key = key; + this.value = value; + } + + public String getGroup() { + return group; + } + + public void setGroup(String group) { + this.group = group; + } + + public String getKey() { + return key; + } + + public void setKey(String key) { + this.key = key; + } + + public long getValue() { + return value; + } + + public void setValue(long value) { + this.value = value; + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/CounterMap.java b/hraven-core/src/main/java/com/twitter/hraven/CounterMap.java new file mode 100644 index 0000000..c36be0c --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/CounterMap.java @@ -0,0 +1,55 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
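The constants above do double duty during ETL: JOB_FILENAME_PATTERN_REGEX pulls the jobtracker name and job id out of a history file name, and SUBMIT_TIME_PREFIX together with QUOTE bracket the submit timestamp inside the raw history bytes. A small sketch exercising both; the file name comes from the javadoc example, while the raw history fragment is invented for illustration:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    import com.twitter.hraven.Constants;

    public class ConstantsParsingSketch {
      public static void main(String[] args) {
        // Group 1 is the jobtracker name, group 2 the job id.
        Pattern filePattern = Pattern.compile(Constants.JOB_FILENAME_PATTERN_REGEX);
        Matcher m = filePattern.matcher(
            "cluster-jt.identifier.example.com_1333569494142_job_201204041958_150125_conf.xml");
        if (m.matches()) {
          System.out.println(m.group(1)); // cluster-jt.identifier.example.com
          System.out.println(m.group(2)); // job_201204041958_150125
        }

        // The submit time sits between SUBMIT_TIME_PREFIX and the next quote in
        // the raw job history; this fragment is made up but follows that shape.
        String raw = "Job JOBID=\"job_201204041958_150125\" SUBMIT_TIME=\"1333569494142\" ";
        int start = raw.indexOf(Constants.SUBMIT_TIME_PREFIX)
            + Constants.SUBMIT_TIME_PREFIX.length();
        int end = raw.indexOf(Constants.QUOTE, start);
        System.out.println(Long.parseLong(raw.substring(start, end))); // 1333569494142
      }
    }
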
+*/ +package com.twitter.hraven; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.codehaus.jackson.map.annotate.JsonSerialize; + +@JsonSerialize( + include=JsonSerialize.Inclusion.NON_NULL +) +public class CounterMap { + private Map> internalMap = new HashMap>(); + + public Set getGroups() { + return internalMap.keySet(); + } + + public Map getGroup(String group) { + return internalMap.get(group); + } + + public Counter getCounter(String group, String name) { + Map groupCounters = getGroup(group); + if (groupCounters != null) { + return groupCounters.get(name); + } + + return null; + } + + public void add(Counter counter) { + Map groupCounters = internalMap.get(counter.getGroup()); + if (groupCounters == null) { + groupCounters = new HashMap(); + internalMap.put(counter.getGroup(), groupCounters); + } + groupCounters.put(counter.getKey(), counter); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/Flow.java b/hraven-core/src/main/java/com/twitter/hraven/Flow.java new file mode 100644 index 0000000..c4bfe91 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/Flow.java @@ -0,0 +1,394 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import org.apache.commons.lang.builder.CompareToBuilder; +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.hadoop.hbase.util.Bytes; +import org.codehaus.jackson.annotate.JsonCreator; +import org.codehaus.jackson.annotate.JsonProperty; +import org.codehaus.jackson.map.annotate.JsonSerialize; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +/** + * A flow represents a collection of map reduce jobs run together as a data + * processing pipeline. In the case of scalding, this would be a given run of a + * scalding script. In the case of pig, this would be a single run of a pig + * script. 
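CounterMap above is a two-level map, counter group first and counter name second; add() creates the per-group map lazily and getCounter() returns null for anything that was never recorded. A short usage sketch with made-up values:

    import com.twitter.hraven.Counter;
    import com.twitter.hraven.CounterMap;

    public class CounterMapSketch {
      public static void main(String[] args) {
        CounterMap counters = new CounterMap();
        counters.add(new Counter("FileSystemCounters", "HDFS_BYTES_READ", 1024L));
        counters.add(new Counter("FileSystemCounters", "HDFS_BYTES_WRITTEN", 2048L));

        // Look up a single counter, a whole group, and a miss.
        System.out.println(
            counters.getCounter("FileSystemCounters", "HDFS_BYTES_READ").getValue()); // 1024
        System.out.println(counters.getGroup("FileSystemCounters").size());           // 2
        System.out.println(counters.getCounter("FileSystemCounters", "MISSING"));     // null
      }
    }
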
+ */ +@JsonSerialize( + include=JsonSerialize.Inclusion.NON_NULL + ) +public class Flow implements Comparable { + public enum Status { RUNNING('r'), SUCCEEDED('s'), FAILED('f'); + + byte[] code; + + Status(char code) { + this.code = new byte[]{(byte)code}; + } + public byte[] code() { + return this.code; + } + } + + public static Map STATUS_BY_CODE = + new TreeMap(Bytes.BYTES_COMPARATOR); + static { + for (Status s : Status.values()) { + STATUS_BY_CODE.put(s.code(), s); + } + } + + private FlowKey key; + /** Key for this flow in the flow_queue table (may be null) */ + private FlowQueueKey queueKey; + + private List jobs = new ArrayList(); + + /** Descriptive name for the executing flow */ + private String flowName; + + /** Allow flow user to be set outside of the flow key */ + private String userName; + + /** JSON serialized DAG of the workflow jobs */ + private String jobGraphJSON; + + /** Progress indicator for in-progress flows */ + private int progress; + + /** Number of jobs in this flow */ + private int jobCount; + + /** Number of map tasks in this flow */ + private long totalMaps; + + /** Number of reduce tasks in this flow */ + private long totalReduces; + + /** map file bytes read in this flow */ + private long mapFileBytesRead; + + /** map file bytes written in this flow */ + private long mapFileBytesWritten; + + /** reduce file bytes read in this flow */ + private long reduceFileBytesRead; + + /** HDFS bytes read in this flow */ + private long hdfsBytesRead; + + /** HDFS bytes written in this flow */ + private long hdfsBytesWritten; + + /** map slot millis in this flow */ + private long mapSlotMillis; + + /** reduce slot millis in this flow */ + private long reduceSlotMillis; + + /** reduce shuffle bytes in this flow */ + private long reduceShuffleBytes; + + /** duration/runtime for this flow */ + private long duration; + + /** submit time for this flow (submit time of first job) */ + private long submitTime; + + /** app Version for this flow */ + private String version ; + + /** + * Constructor + * + * @param key + */ + @JsonCreator + public Flow(@JsonProperty("flowKey") FlowKey key) { + this.key = key; + // default flow name to appId + if (this.key != null) { + this.flowName = this.key.getAppId(); + this.userName = this.key.getUserName(); + } + } + + /** + * @param jobKey + * @return whether the given jobKey is part of this flow. + */ + public boolean contains(JobKey jobKey) { + if (jobKey == null || this.key == null) { + return false; + } + + // No need to check for null because jobKey will not return nulls. 
+ return key.equals(jobKey); + } + + /** + * Compares two Flow objects on the basis of their FlowKeys + * + * @param other + * @return 0 if the FlowKeys are equal, + * 1 if this FlowKey greater than other FlowKey, + * -1 if this FlowKey is less than other FlowKey + * + */ + @Override + public int compareTo(Flow otherFlow) { + if (otherFlow == null) { + return -1; + } + return new CompareToBuilder().append(this.key, otherFlow.getFlowKey()) + .toComparison(); + } + + @Override + public boolean equals(Object other) { + if (other instanceof Flow) { + return compareTo((Flow)other) == 0; + } + return false; + } + + @Override + public int hashCode(){ + return new HashCodeBuilder() + .append(this.key) + .toHashCode(); + } + + public FlowKey getFlowKey() { + return key; + } + + public FlowQueueKey getQueueKey() { + return queueKey; + } + + public void setFlowQueueKey(FlowQueueKey queueKey) { + this.queueKey = queueKey; + } + + public String getCluster() { + if (this.key == null) { + return null; + } + return this.key.getCluster(); + } + + public String getAppId() { + if (this.key == null) { + return null; + } + return this.key.getAppId(); + } + + public long getRunId() { + if (this.key == null) { + return 0; + } + return this.key.getRunId(); + } + + public List getJobs() { + return this.jobs; + } + + public void addJob(JobDetails job) { + this.jobs.add(job); + this.jobCount++; + this.totalMaps += job.getTotalMaps(); + this.totalReduces += job.getTotalReduces(); + this.hdfsBytesRead += job.getHdfsBytesRead(); + this.hdfsBytesWritten += job.getHdfsBytesWritten(); + this.reduceShuffleBytes += job.getReduceShuffleBytes(); + this.mapFileBytesRead += job.getMapFileBytesRead(); + this.mapFileBytesWritten += job.getMapFileBytesWritten(); + this.reduceFileBytesRead += job.getReduceFileBytesRead(); + this.mapSlotMillis += job.getMapSlotMillis(); + this.reduceSlotMillis += job.getReduceSlotMillis(); + + // set the submit time of the flow to the submit time of the first job + if ( this.submitTime == 0L ) { + this.submitTime = job.getSubmitTime(); + } + + this.version = job.getVersion(); + } + + public String getUserName() { + return this.userName; + } + + /** + * Allows username to be set before a full {@link FlowKey} can be constructed. 
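Flow above derives its default name from the appId in its key, exposes single-byte status codes for the flow_queue row key (with STATUS_BY_CODE for the reverse lookup), and rolls per-job counters up in addJob(). A sketch of the parts that need no other classes; the key values are made up:

    import com.twitter.hraven.Flow;
    import com.twitter.hraven.FlowKey;

    public class FlowSketch {
      public static void main(String[] args) {
        // runId is the run timestamp; its byte representation is inverted for
        // reverse-chronological sorting.
        FlowKey key = new FlowKey("cluster1", "userName1", "App1", 1333569494142L);
        Flow flow = new Flow(key);

        // The flow name defaults to the appId until set explicitly.
        System.out.println(flow.getFlowName()); // App1
        System.out.println(flow.getJobCount()); // 0 until addJob() is called

        // Status codes are single bytes; STATUS_BY_CODE maps them back.
        byte[] code = Flow.Status.RUNNING.code();
        System.out.println(Flow.STATUS_BY_CODE.get(code)); // RUNNING
      }
    }
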
+ * @param name + */ + public void setUserName(String name) { + this.userName = name; + } + + public String getJobGraphJSON() { + return jobGraphJSON; + } + + public void setJobGraphJSON(String json) { + this.jobGraphJSON = json; + } + + public String getFlowName() { + return this.flowName; + } + + public void setFlowName(String name) { + this.flowName = name; + } + + public int getProgress() { + return this.progress; + } + + public void setProgress(int progress) { + this.progress = progress; + } + + // for JSON deserialiation + void setJobs(List jobs) { this.jobs = jobs; } + + public int getJobCount() { + return jobCount; + } + + public void setJobCount(int jobCount) { + this.jobCount = jobCount; + } + + public long getTotalMaps() { + return totalMaps; + } + + public void setTotalMaps(long totalMaps) { + this.totalMaps = totalMaps; + } + + public long getTotalReduces() { + return totalReduces; + } + + public void setTotalReduces(long totalReduces) { + this.totalReduces = totalReduces; + } + + public long getMapSlotMillis() { + return mapSlotMillis; + } + + public void setMapSlotMillis(long mapSlotMillis) { + this.mapSlotMillis = mapSlotMillis; + } + + public long getReduceSlotMillis() { + return reduceSlotMillis; + } + + public void setReduceSlotMillis(long reduceSlotMillis) { + this.reduceSlotMillis = reduceSlotMillis; + } + + public long getReduceShuffleBytes() { + return reduceShuffleBytes; + } + + public void setReduceShuffleBytes(long reduceShuffleBytes) { + this.reduceShuffleBytes = reduceShuffleBytes; + } + + public long getHdfsBytesRead() { + return hdfsBytesRead; + } + + public void setHdfsBytesRead(long hdfsBytesRead) { + this.hdfsBytesRead = hdfsBytesRead; + } + + public long getHdfsBytesWritten() { + return hdfsBytesWritten; + } + + public void setHdfsBytesWritten(long hdfsBytesWritten) { + this.hdfsBytesWritten = hdfsBytesWritten; + } + + public long getDuration() { + if (this.getJobCount() > 0) { + this.duration = this.getJobs().get(this.getJobCount() - 1).getFinishTime() + - this.getJobs().get(0).getLaunchTime(); + } + return duration; + } + + public void setDuration(long duration) { + this.duration = duration; + } + + public long getSubmitTime() { + return submitTime; + } + + public void setSubmitTime(long submitTime) { + this.submitTime = submitTime; + } + + public String getVersion() { + return version; + } + + public void setVersion(String version) { + this.version = version; + } + + public long getMapFileBytesRead() { + return mapFileBytesRead; + } + + public void setMapFileBytesRead(long mapFileBytesRead) { + this.mapFileBytesRead = mapFileBytesRead; + } + + public long getMapFileBytesWritten() { + return mapFileBytesWritten; + } + + public void setMapFileBytesWritten(long mapFileBytesWritten) { + this.mapFileBytesWritten = mapFileBytesWritten; + } + + public long getReduceFileBytesRead() { + return reduceFileBytesRead; + } + + public void setReduceFileBytesRead(long reduceFileBytesRead) { + this.reduceFileBytesRead = reduceFileBytesRead; + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/FlowEvent.java b/hraven-core/src/main/java/com/twitter/hraven/FlowEvent.java new file mode 100644 index 0000000..81b9bfb --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/FlowEvent.java @@ -0,0 +1,67 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +/** + * Represents an event generated during flow execution + */ +public class FlowEvent { + private FlowEventKey key; + private long timestamp; + private Framework framework; + private String type; + private String eventDataJSON; + + public FlowEvent(FlowEventKey key) { + this.key = key; + } + + public FlowEventKey getFlowEventKey() { + return this.key; + } + + public long getTimestamp() { + return timestamp; + } + + public void setTimestamp(long timestamp) { + this.timestamp = timestamp; + } + + public Framework getFramework() { + return framework; + } + + public void setFramework(Framework framework) { + this.framework = framework; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getEventDataJSON() { + return eventDataJSON; + } + + public void setEventDataJSON(String eventDataJSON) { + this.eventDataJSON = eventDataJSON; + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/FlowEventKey.java b/hraven-core/src/main/java/com/twitter/hraven/FlowEventKey.java new file mode 100644 index 0000000..1bb7340 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/FlowEventKey.java @@ -0,0 +1,46 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +/** + * Key class representing rows in the {@link Constants#FLOW_EVENT_TABLE} table. + */ +public class FlowEventKey extends FlowKey { + private int sequence; + + public FlowEventKey(FlowKey flowKey, int sequence) { + super(flowKey); + this.sequence = sequence; + } + + public FlowEventKey(String cluster, String user, String appId, long runId, int sequence) { + super(cluster, user, appId, runId); + this.sequence = sequence; + } + + public int getSequence() { + return this.sequence; + } + + @Override + public boolean equals(Object other) { + if (other == null || !(other instanceof FlowEventKey)) { + return false; + } + FlowEventKey otherKey = (FlowEventKey)other; + return super.equals(other) && this.sequence == otherKey.sequence; + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/FlowKey.java b/hraven-core/src/main/java/com/twitter/hraven/FlowKey.java new file mode 100644 index 0000000..f452cd0 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/FlowKey.java @@ -0,0 +1,150 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import org.apache.commons.lang.builder.CompareToBuilder; +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.codehaus.jackson.annotate.JsonCreator; +import org.codehaus.jackson.annotate.JsonProperty; + +//Leaving comparable as a raw due to sub-typing/overriding issues. +@SuppressWarnings("rawtypes") +public class FlowKey implements Comparable { + /** + * The cluster on which the job ran. + */ + protected final String cluster; + /** + * Who ran the final map-reduce flow on Hadoop. + */ + protected final String userName; + /** + * Identifying an application, which can go through different versions. + */ + protected final String appId; + /** + * Identifying one single run of a version of an app. Smaller values indicate + * a later run. We're using an inverted timestamp Long.MAXVALUE - + * timstampMillis (milliseconds since January 1, 1970 UTC) + */ + protected long runId; + + @JsonCreator + public FlowKey(@JsonProperty("cluster") String cluster, + @JsonProperty("userName") String userName, + @JsonProperty("appId") String appId, + @JsonProperty("runId") long runId) { + this.cluster = cluster; + this.runId = runId; + this.userName = (null == userName) ? Constants.UNKNOWN : userName.trim(); + this.appId = (null == appId) ? Constants.UNKNOWN : appId.trim(); + } + + public FlowKey(FlowKey toCopy) { + this(toCopy.getCluster(), toCopy.getUserName(), toCopy.getAppId(), toCopy.getRunId()); + } + + /** + * @return The cluster on which the job ran. + */ + public String getCluster() { + return cluster; + } + + /** + * @return Who ran the final map-reduce flow on Hadoop. + */ + public String getUserName() { + return userName; + } + + /** + * @return The thing that identifies an application, such as Pig script + * identifier, or Scalding identifier. + */ + public String getAppId() { + return appId; + } + + /** + * Inverted version of {@link JobKey#getRunId()} + * used in the byte representation for reverse chronological sorting. + * @return + */ + public long getEncodedRunId() { + return encodeRunId(runId); + } + + /** + * Encodes the given timestamp for ordering by run ID + */ + public static long encodeRunId(long timestamp) { + return Long.MAX_VALUE - timestamp; + } + + /** + * @return Identifying one single run of a version of an app. A smaller value + * should indicate a later run. 
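The encodeRunId() helper above is what makes run ordering work: run ids are flipped around Long.MAX_VALUE so that the latest run gets the smallest encoded value and therefore sorts first in the byte-ordered row keys, which is also how compareTo() orders FlowKeys. A tiny sketch:

    import com.twitter.hraven.FlowKey;

    public class RunIdEncodingSketch {
      public static void main(String[] args) {
        long earlierRun = 1333569494142L;
        long laterRun = 1333569999999L;

        // The later run gets the smaller encoded value, so plain ascending order
        // over encoded run ids is reverse-chronological.
        System.out.println(FlowKey.encodeRunId(earlierRun) > FlowKey.encodeRunId(laterRun)); // true

        // FlowKey comparison uses the encoded run id, so the newer run sorts first.
        FlowKey older = new FlowKey("cluster1", "userName1", "App1", earlierRun);
        FlowKey newer = new FlowKey("cluster1", "userName1", "App1", laterRun);
        System.out.println(newer.compareTo(older) < 0); // true
      }
    }
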
+ */ + public long getRunId() { + return runId; + } + + /** + * Compares two FlowKey objects on the basis of + * their cluster, userName, appId and encodedRunId + * + * @param other + * @return 0 if this cluster, userName, appId and encodedRunId are equal to + * the other's cluster, userName, appId and encodedRunId, + * 1 if this cluster or userName or appId or encodedRunId are less than + * the other's cluster, userName, appId and encodedRunId, + * -1 if this cluster and userName and appId and encodedRunId are greater + * the other's cluster, userName, appId and encodedRunId, + * + */ + @Override + public int compareTo(Object other) { + if (other == null) { + return -1; + } + FlowKey otherKey = (FlowKey)other; + return new CompareToBuilder().append(this.cluster, otherKey.getCluster()) + .append(this.userName, otherKey.getUserName()) + .append(this.appId, otherKey.getAppId()) + .append(getEncodedRunId(), otherKey.getEncodedRunId()) + .toComparison(); + } + + @Override + public boolean equals(Object other) { + if (other instanceof FlowKey) { + return compareTo((FlowKey)other) == 0; + } + return false; + } + + @Override + public int hashCode(){ + return new HashCodeBuilder() + .append(this.cluster) + .append(this.userName) + .append(this.appId) + .append(getEncodedRunId()) + .toHashCode(); + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/FlowQueueKey.java b/hraven-core/src/main/java/com/twitter/hraven/FlowQueueKey.java new file mode 100644 index 0000000..f739e96 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/FlowQueueKey.java @@ -0,0 +1,78 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import org.apache.commons.lang.builder.EqualsBuilder; +import org.apache.commons.lang.builder.ToStringBuilder; + +/** + * Represents the row key for an entry in the flow_queue table. 
flow_queue rows are keyed by: + * - cluster + * - status code + * - inverted timestamp + * - unique ID + */ +public class FlowQueueKey { + private final String cluster; + private final Flow.Status status; + private final long timestamp; + private final String flowId; + + public FlowQueueKey(String cluster, Flow.Status status, long timestamp, String flowId) { + this.cluster = cluster; + this.status = status; + this.timestamp = timestamp; + this.flowId = flowId; + } + + public String getCluster() { + return cluster; + } + + public Flow.Status getStatus() { + return status; + } + + public long getTimestamp() { + return timestamp; + } + + public String getFlowId() { + return flowId; + } + + @Override + public boolean equals(Object other) { + if (other == null || !(other instanceof FlowQueueKey)) { + return false; + } + FlowQueueKey otherKey = (FlowQueueKey)other; + return new EqualsBuilder().append(this.cluster, otherKey.cluster) + .append(this.status, otherKey.status) + .append(this.timestamp, otherKey.timestamp) + .append(this.flowId, otherKey.flowId) + .isEquals(); + } + + public String toString() { + return new ToStringBuilder(this) + .append(this.cluster) + .append(this.status) + .append(this.timestamp) + .append(this.flowId) + .toString(); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/Framework.java b/hraven-core/src/main/java/com/twitter/hraven/Framework.java new file mode 100644 index 0000000..29f00bb --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/Framework.java @@ -0,0 +1,91 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +/** + * Used to distinguish the framework used to launch the map-reduce job with. + */ +public enum Framework { + + /** + * Identifies Pig applications/ pig scripts + */ + PIG("p", "pig"), + /** + * + */ + SCALDING("s", "scalding"), + /** + * + */ + NONE("n", "none, plain map-reduce"); + + // TODO: Add Hive as a framework and at least recognize those jobs as such. + + /** + * The code representing this application type as used in the {@link JobDesc} + */ + private final String code; + + /** + * The description for this {@link Framework} + */ + private final String description; + + /** + * Constructor + * + * @param code + * for this type + * @param description + * for this type + */ + private Framework(String code, String description) { + this.code = code; + this.description = description; + } + + /** + * @return the code corresponding to this type. + */ + public String getCode() { + return code; + } + + /** + * @return the description for this type. 
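+ *
+ *         (Editor's note: illustrative only, not part of the original patch.)
+ *         The single-letter codes declared above are resolved back to enum
+ *         values by {@link #get(String)}, defined below; any unrecognized code
+ *         falls back to NONE:
+ *
+ *           Framework.get("p");   // PIG
+ *           Framework.get("s");   // SCALDING
+ *           Framework.get("x");   // NONE (plain map-reduce)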
+ */ + public String getDescription() { + return description; + } + + /** + * Get the {@link Framework} corresponding to this code, or none if not + * specifically Pig or Scalding + * + * @param code + */ + public static Framework get(String code) { + + for (Framework framework : Framework.values()) { + if (framework.getCode().equals(code)) { + return framework; + } + } + return NONE; + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/JobDesc.java b/hraven-core/src/main/java/com/twitter/hraven/JobDesc.java new file mode 100644 index 0000000..06de044 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/JobDesc.java @@ -0,0 +1,165 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + + +/** + * Describes a Job at a higher level of abstraction than individual JobConf + * entries. Jobs run from different {@code Framework}s can interpret the same + * JobConf entries differently, or conversely, use different JobConf entries to + * denote the same concept. + */ +public class JobDesc { + + /** + * The combined cluster and job ID + */ + private final QualifiedJobId jobId; + + /** + * Who ran the final map-reduce flow on Hadoop. + */ + private final String userName; + + /** + * Identifying an application, which can go through different versions. + */ + private final String appId; + + /** + * The version of this application. An app can go through slight + * modifications. + */ + private final String version; + + /** + * Identifying one single run of a version of an app. Smaller values indicate + * a later run. We're using an inverted timestamp Long.MAXVALUE - + * timstampMillis (milliseconds since January 1, 1970 UTC) + */ + private final long runId; + + /** + * Used to launch the job on the Hadoop cluster. + */ + private final Framework framework; + + /** + * Constructor. Used within the class hierarchy. Consider using @ {code + * JobDescFactory} factory instead. + * + * @param cluster + * the Hadoop cluster on which the job ran. + * @param userName + * the Hadoop user name that ran a job + * @param appId + * The thing that identifies an application, such as Pig script + * identifier, or Scalding identifier. + * @param version + * The verson of this application + * @param runId + * The identifier that ties the various runs for this job together + * @param jobId + * The Hadoop generated MapReduce JobID. + * @param framework + * used to launch the map-reduce job. + */ + JobDesc(String cluster, String userName, String appId, String version, + long runId, String jobId, Framework framework) { + // TODO: Change contract to allow for nulls and advertise strings. + this(new QualifiedJobId(cluster, jobId), userName, appId, version, runId, + framework); + } + + JobDesc(QualifiedJobId jobId, String user, String appId, String version, + long runId, Framework framework) { + this.jobId = jobId; + this.userName = (null == user) ? Constants.UNKNOWN : user.trim(); + this.appId = (null == appId) ? 
Constants.UNKNOWN : appId.trim(); + this.version = (null == version) ? Constants.UNKNOWN : version.trim(); + this.runId = runId; + this.framework = framework; + } + /** + * @return The fully qualified cluster + job ID + */ + public QualifiedJobId getQualifiedJobId() { + return jobId; + } + + /** + * @return The cluster on which the job ran. + */ + public String getCluster() { + return jobId.getCluster(); + } + + /** + * @return Who ran the final map-reduce flow on Hadoop. + */ + public String getUserName() { + return userName; + } + + /** + * @return The thing that identifies an application, such as Pig script + * identifier, or Scalding identifier. + */ + public String getAppId() { + return appId; + } + + /** + * @return Identifying one single run of a version of an app. A smaller value + * should indicate a later run. + */ + public long getRunId() { + return runId; + } + + /** + * @return Identifying the version of an app. + */ + public String getVersion() { + return version; + } + + /** + * @return The Hadoop map-reduce Job identifier as run on the JobTracker. + */ + public String getJobId() { + return jobId.getJobIdString(); + } + + /** + * @return the famework used to launch this job with on the Hadoop cluster. + */ + public Framework getFramework() { + return framework; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + public String toString() { + return getCluster() + Constants.SEP + this.userName + Constants.SEP + + this.appId + Constants.SEP + this.version + Constants.SEP + + this.runId + Constants.SEP + this.jobId + Constants.SEP + this.framework; + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/JobDescFactory.java b/hraven-core/src/main/java/com/twitter/hraven/JobDescFactory.java new file mode 100644 index 0000000..3bcfe01 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/JobDescFactory.java @@ -0,0 +1,104 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import org.apache.hadoop.conf.Configuration; + +/** + * Deal with {@link JobDesc} implementations. + */ +public class JobDescFactory { + /** Key used to identify the jobtracker host in job configurations. */ + public static final String JOBTRACKER_KEY = "mapred.job.tracker"; + + private static final MRJobDescFactory MR_JOB_DESC_FACTORY = new MRJobDescFactory(); + private static final PigJobDescFactory PIG_JOB_DESC_FACTORY = new PigJobDescFactory(); + private static final ScaldingJobDescFactory SCALDING_JOB_DESC_FACTORY = + new ScaldingJobDescFactory(); + + /** + * @param submitTimeMillis + * @param qualifiedJobId + * Identifier for the job for the given {@link Configuration} + * @param jobConf + * the jobConf for the given job. + * @return the job description for the given JobConfiguration. 
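+ *
+ *         (Editor's note: a hypothetical sketch, not part of the original
+ *         patch.) The factory keys off {@link #getFramework(Configuration)},
+ *         defined below, so for example:
+ *
+ *           Configuration conf = new Configuration();
+ *           conf.set(Constants.PIG_CONF_KEY, "anything"); // any non-null value marks a Pig job
+ *           // createJobDesc(qualifiedJobId, submitTimeMillis, conf) would now
+ *           // delegate to the Pig factory; with neither the Pig key nor
+ *           // Constants.CASCADING_FLOW_ID_CONF_KEY set, it falls back to the
+ *           // plain map-reduce factory.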
+ */ + public static JobDesc createJobDesc(QualifiedJobId qualifiedJobId, + long submitTimeMillis, Configuration jobConf) { + JobDesc jobDesc = null; + + Framework framework = getFramework(jobConf); + + switch (framework) { + case PIG: + jobDesc = PIG_JOB_DESC_FACTORY.create(qualifiedJobId, submitTimeMillis, + jobConf); + break; + case SCALDING: + jobDesc = SCALDING_JOB_DESC_FACTORY.create(qualifiedJobId, submitTimeMillis, + jobConf); + break; + + default: + jobDesc = MR_JOB_DESC_FACTORY.create(qualifiedJobId, submitTimeMillis, + jobConf); + break; + } + + return jobDesc; + } + + /** + * @param jobConf + * a given job configuration. + * @return which framerwork was used to launch that configuration. + */ + public static Framework getFramework(Configuration jobConf) { + // Check if this is a pig job + boolean isPig = jobConf.get(Constants.PIG_CONF_KEY) != null; + if (isPig) { + return Framework.PIG; + } else { + String flowId = jobConf.get(Constants.CASCADING_FLOW_ID_CONF_KEY); + if ((flowId == null) || (flowId.length() == 0)) { + return Framework.NONE; + } else { + return Framework.SCALDING; + } + } + } + + /** + * Returns the cluster that a give job was run on by mapping the jobtracker hostname to an + * identifier. + * @param jobConf + * @return + */ + public static String getCluster(Configuration jobConf) { + String jobtracker = jobConf.get(JOBTRACKER_KEY); + // strip any port number + int portIdx = jobtracker.indexOf(':'); + if (portIdx > -1) { + jobtracker = jobtracker.substring(0, portIdx); + } + // An ExceptionInInitializerError may be thrown to indicate that an exception occurred during + // evaluation of Cluster class' static initialization + String cluster = Cluster.getIdentifier(jobtracker); + return cluster != null ? cluster: null; + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/JobDescFactoryBase.java b/hraven-core/src/main/java/com/twitter/hraven/JobDescFactoryBase.java new file mode 100644 index 0000000..933d453 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/JobDescFactoryBase.java @@ -0,0 +1,127 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import org.apache.hadoop.conf.Configuration; + +import com.twitter.hraven.util.StringUtil; + +/** + * Provide functionality that is common to the various JobKeyFactory classes. + */ +public abstract class JobDescFactoryBase { + + /** + * Default constructor. + */ + public JobDescFactoryBase() { + } + + /** + * @param qualifiedJobId + * jobId qualified with cluster. + * @param submitTimeMillis + * the job, script or flow submit time in milliseconds since January + * 1, 1970 UTC + * @param jobConf + * of the job. + * @return the identifier for the job in the JobHistory table. + */ + abstract JobDesc create(QualifiedJobId qualifiedJobId, long submitTimeMillis, + Configuration jobConf); + + /** + * Factory method to be used by subclasses. + * + * @param qualifiedJobId + * Identifying the cluster and the jobId. 
Cannot be null; + * @param jobConf + * the Job configuration of the job + * @param appId + * The thing that identifies an application, such as Pig script + * identifier, or Scalding identifier. + * @param version + * @param framework used to launch the map-reduce job. + * @param submitTimeMillis + * Identifying one single run of a version of an app. + * @return a JobKey with the given parameters and the userName added. + */ + protected JobDesc create(QualifiedJobId qualifiedJobId, Configuration jobConf, + String appId, String version, Framework framework, long submitTimeMillis) { + + if (null == qualifiedJobId) { + throw new IllegalArgumentException( + "Cannot create a JobKey from a null qualifiedJobId."); + } + + // Add individual fields. + String userName = jobConf.get(Constants.USER_CONF_KEY); + + return new JobDesc(qualifiedJobId, userName, appId, version, + submitTimeMillis, framework); + } + + /** + * @param jobConf + * from which to pull the properties + * @return a non-empty non-null string with the jobId. If the jobId cannot be + * parsed, then {@link Constants#UNKNOWN} will be returned. + */ + protected String getAppId(Configuration jobConf) { + + // Defensive coding + if (jobConf == null) { + return Constants.UNKNOWN; + } + + String appId = jobConf.get(Constants.APP_NAME_CONF_KEY); + + // If explicit app name isn't set, try to parse it from mapred.job.name + if (appId == null) { + appId = jobConf.get(Constants.JOB_NAME_CONF_KEY); + if (appId != null) { + // Allow sub-classes to transform. + appId = getAppIdFromJobName(appId); + } + } + + return cleanAppId(appId); + } + + /** + * Given a potential value for appId, return a string that is safe to use in + * the jobKey + * + * @param appId + * possibly null value. + * @return non-null value stripped of separators that are used in the jobKey. + */ + protected String cleanAppId(String appId) { + return (appId != null) ? StringUtil.cleanseToken(appId) : Constants.UNKNOWN; + } + + /** + * Subclasses are to implement this method to strip the jobId from a jobName. + * This allows separate implementations to treat the name differently. + * + * @param jobName + * on-null name of the job + * @return the AppId given the jobName. Note that delimiters and spaces will + * be stripped from the result to avoid clashes in the key. + */ + abstract String getAppIdFromJobName(String jobName); + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/JobDetails.java b/hraven-core/src/main/java/com/twitter/hraven/JobDetails.java new file mode 100644 index 0000000..b1b69d8 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/JobDetails.java @@ -0,0 +1,502 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven; + +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.NavigableMap; + +import org.apache.commons.lang.builder.CompareToBuilder; +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.util.Bytes; +import org.codehaus.jackson.annotate.JsonCreator; +import org.codehaus.jackson.annotate.JsonProperty; +import org.codehaus.jackson.map.annotate.JsonSerialize; + +import com.twitter.hraven.datasource.JobHistoryService; + +/** + * Represents the configuration, statistics, and counters from a single + * map reduce job. Individual task details are also nested, though may not + * be loaded in all cases. + */ +@JsonSerialize( + include= JsonSerialize.Inclusion.NON_NULL +) +public class JobDetails implements Comparable { + @SuppressWarnings("unused") + private static Log LOG = LogFactory.getLog(JobDetails.class); + + // job key -- maps to row key + private JobKey jobKey; + + // unique job ID assigned by job tracker + private String jobId; + + // job-level stats + private String jobName; + private String user; + private String priority; + private String status; + private String version; + private long submitTime; + private long launchTime; + private long finishTime; + private long totalMaps; + private long totalReduces; + private long finishedMaps; + private long finishedReduces; + private long failedMaps; + private long failedReduces; + private long mapFileBytesRead; + private long mapFileBytesWritten; + private long reduceFileBytesRead; + private long hdfsBytesRead; + private long hdfsBytesWritten; + private long mapSlotMillis; + private long reduceSlotMillis; + private long reduceShuffleBytes; + + // job config + private Configuration config; + + // job-level counters + private CounterMap counters = new CounterMap(); + private CounterMap mapCounters = new CounterMap(); + private CounterMap reduceCounters = new CounterMap(); + + // tasks + private List tasks = new ArrayList(); + + @JsonCreator + public JobDetails(@JsonProperty("jobKey") JobKey key) { + this.jobKey = key; + } + + @Override + public boolean equals(Object other) { + if (other instanceof JobDetails) { + return compareTo((JobDetails)other) == 0; + } + return false; + } + + /** + * Compares two JobDetails objects on the basis of their JobKey + * + * @param other + * @return 0 if this JobKey is equal to the other JobKey, + * 1 if this JobKey greater than other JobKey, + * -1 if this JobKey is less than other JobKey + * + */ + @Override + public int compareTo(JobDetails otherJob) { + if (otherJob == null) { + return -1; + } + return new CompareToBuilder().append(this.jobKey, otherJob.getJobKey()) + .toComparison(); + } + + @Override + public int hashCode(){ + return new HashCodeBuilder() + .append(this.jobKey) + .toHashCode(); + } + + public JobKey getJobKey() { + return this.jobKey; + } + + public String getJobId() { + return jobId; + } + + public void setJobId(String jobId) { + this.jobId = jobId; + } + + public String getJobName() { + return jobName; + } + + public void setJobName(String jobName) { + this.jobName = jobName; + } + + public String getUser() { + return user; + } + + public void setUser(String user) { + this.user = user; + } + + public String getPriority() { + return priority; + } + + public void setPriority(String priority) { + this.priority = 
priority; + } + + public String getStatus() { + return status; + } + + public void setStatus(String status) { + this.status = status; + } + + public String getVersion() { + return version; + } + + public void setVersion(String version) { + this.version = version; + } + + public long getSubmitTime() { + return submitTime; + } + + public void setSubmitTime(long submitTime) { + this.submitTime = submitTime; + } + + public Date getSubmitDate() { + return new Date(this.submitTime); + } + + public long getLaunchTime() { + return launchTime; + } + + public void setLaunchTime(long launchTime) { + this.launchTime = launchTime; + } + + public Date getLaunchDate() { + return new Date(this.launchTime); + } + + public long getFinishTime() { + return finishTime; + } + + public void setFinishTime(long finishTime) { + this.finishTime = finishTime; + } + + public Date getFinishDate() { + return new Date(this.finishTime); + } + + /** + * Returns the elapsed run time for this job (finish time minus launch time). + * @return + */ + public long getRunTime() { + return finishTime - launchTime; + } + + public long getTotalMaps() { + return totalMaps; + } + + public void setTotalMaps(long totalMaps) { + this.totalMaps = totalMaps; + } + + public long getTotalReduces() { + return totalReduces; + } + + public void setTotalReduces(long totalReduces) { + this.totalReduces = totalReduces; + } + + public long getFinishedMaps() { + return finishedMaps; + } + + public void setFinishedMaps(long finishedMaps) { + this.finishedMaps = finishedMaps; + } + + public long getFinishedReduces() { + return finishedReduces; + } + + public void setFinishedReduces(long finishedReduces) { + this.finishedReduces = finishedReduces; + } + + public long getFailedMaps() { + return failedMaps; + } + + public void setFailedMaps(long failedMaps) { + this.failedMaps = failedMaps; + } + + public long getFailedReduces() { + return failedReduces; + } + + public void setFailedReduces(long failedReduces) { + this.failedReduces = failedReduces; + } + + public long getMapFileBytesRead() { + return mapFileBytesRead; + } + + public void setMapFileBytesRead(long mapFileBytesRead) { + this.mapFileBytesRead = mapFileBytesRead; + } + + public long getMapFileBytesWritten() { + return mapFileBytesWritten; + } + + public void setMapFileBytesWritten(long mapBytesWritten) { + this.mapFileBytesWritten = mapBytesWritten; + } + + public long getHdfsBytesRead() { + return hdfsBytesRead; + } + + public long getMapSlotMillis() { + return mapSlotMillis; + } + + public void setMapSlotMillis(long mapSlotMillis) { + this.mapSlotMillis = mapSlotMillis; + } + + public long getReduceSlotMillis() { + return reduceSlotMillis; + } + + public void setReduceSlotMillis(long reduceSlotMillis) { + this.reduceSlotMillis = reduceSlotMillis; + } + + public long getReduceShuffleBytes() { + return reduceShuffleBytes; + } + + public void setReduceShuffleBytes(long reduceShuffleBytes) { + this.reduceShuffleBytes = reduceShuffleBytes; + } + + public long getReduceFileBytesRead() { + return reduceFileBytesRead; + } + + public void setReduceFileBytesRead(long reduceFileBytesRead) { + this.reduceFileBytesRead = reduceFileBytesRead; + } + + public long getHdfsBytesWritten() { + return hdfsBytesWritten; + } + + public void setHdfsBytesWritten(long hdfsBytesWritten) { + this.hdfsBytesWritten = hdfsBytesWritten; + } + + public void setHdfsBytesRead(long hdfsBytesRead) { + this.hdfsBytesRead = hdfsBytesRead; + } + + public void addTask(TaskDetails task) { + this.tasks.add(task); + } + + public List 
getTasks() { + return this.tasks; + } + + public Configuration getConfiguration() { + return this.config; + } + + public CounterMap getCounters() { + return this.counters; + } + + public CounterMap getMapCounters() { + return this.mapCounters; + } + + public CounterMap getReduceCounters() { + return this.reduceCounters; + } + + // for JSON deserialization + void setConfiguration(Configuration config) { this.config = config; } + void setCounters(CounterMap counters) { this.counters = counters; } + void setMapCounters(CounterMap mapCounters) { this.mapCounters = mapCounters; } + void setReduceCounters(CounterMap reduceCounters) { this.reduceCounters = reduceCounters; } + + /** TODO: refactor this out into a data access layer */ + public void populate(Result result) { + // process job-level stats and properties + NavigableMap infoValues = result.getFamilyMap(Constants.INFO_FAM_BYTES); + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.JOBID))) { + this.jobId = Bytes.toString( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.JOBID))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.USER))) { + this.user = Bytes.toString( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.USER))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.JOBNAME))) { + this.jobName = Bytes.toString( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.JOBNAME))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.JOB_PRIORITY))) { + this.priority = Bytes.toString( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.JOB_PRIORITY))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.JOB_STATUS))) { + this.status = Bytes.toString( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.JOB_STATUS))); + } + if (infoValues.containsKey(Constants.VERSION_COLUMN_BYTES)) { + this.version = Bytes.toString(infoValues.get(Constants.VERSION_COLUMN_BYTES)); + } + + // times + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.SUBMIT_TIME))) { + this.submitTime = Bytes.toLong( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.SUBMIT_TIME))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.LAUNCH_TIME))) { + this.launchTime = Bytes.toLong( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.LAUNCH_TIME))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FINISH_TIME))) { + this.finishTime = Bytes.toLong( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FINISH_TIME))); + } + + // task counts + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TOTAL_MAPS))) { + this.totalMaps = Bytes.toLong( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TOTAL_MAPS))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TOTAL_REDUCES))) { + this.totalReduces = Bytes.toLong( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TOTAL_REDUCES))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FINISHED_MAPS))) { + this.finishedMaps = Bytes.toLong( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FINISHED_MAPS))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FINISHED_REDUCES))) { + this.finishedReduces = Bytes.toLong( + 
infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FINISHED_REDUCES))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FAILED_MAPS))) { + this.failedMaps = Bytes.toLong( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FAILED_MAPS))); + } + if (infoValues.containsKey(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FAILED_REDUCES))) { + this.failedReduces = Bytes.toLong( + infoValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FAILED_REDUCES))); + } + + this.config = JobHistoryService.parseConfiguration(infoValues); + this.counters = JobHistoryService.parseCounters( + Constants.COUNTER_COLUMN_PREFIX_BYTES, infoValues); + this.mapCounters = JobHistoryService.parseCounters( + Constants.MAP_COUNTER_COLUMN_PREFIX_BYTES, infoValues); + this.reduceCounters = JobHistoryService.parseCounters( + Constants.REDUCE_COUNTER_COLUMN_PREFIX_BYTES, infoValues); + + // populate stats from counters for this job + // map file bytes read + if (this.mapCounters.getCounter(Constants.FILESYSTEM_COUNTERS, Constants.FILES_BYTES_READ) != null) { + this.mapFileBytesRead = + this.mapCounters.getCounter(Constants.FILESYSTEM_COUNTERS, Constants.FILES_BYTES_READ) + .getValue(); + } + + // map file bytes written + if (this.mapCounters.getCounter(Constants.FILESYSTEM_COUNTERS, Constants.FILES_BYTES_WRITTEN) != null) { + this.mapFileBytesWritten = + this.mapCounters.getCounter(Constants.FILESYSTEM_COUNTERS, Constants.FILES_BYTES_WRITTEN) + .getValue(); + } + + // reduce file bytes read + if (this.reduceCounters.getCounter(Constants.FILESYSTEM_COUNTERS, Constants.FILES_BYTES_READ) != null) { + this.reduceFileBytesRead = + this.reduceCounters.getCounter(Constants.FILESYSTEM_COUNTERS, Constants.FILES_BYTES_READ) + .getValue(); + } + + // hdfs bytes read + if (this.counters.getCounter(Constants.FILESYSTEM_COUNTERS, Constants.HDFS_BYTES_READ) != null) { + this.hdfsBytesRead = + this.counters.getCounter(Constants.FILESYSTEM_COUNTERS, Constants.HDFS_BYTES_READ) + .getValue(); + } + + // hdfs bytes written + if (this.counters.getCounter(Constants.FILESYSTEM_COUNTERS, Constants.HDFS_BYTES_WRITTEN) != null) { + this.hdfsBytesWritten = + this.counters.getCounter(Constants.FILESYSTEM_COUNTERS, Constants.HDFS_BYTES_WRITTEN) + .getValue(); + } + + // map slot millis + if (this.counters.getCounter(Constants.JOBINPROGRESS_COUNTER, Constants.SLOTS_MILLIS_MAPS) != null) { + this.mapSlotMillis = + this.counters.getCounter(Constants.JOBINPROGRESS_COUNTER, Constants.SLOTS_MILLIS_MAPS) + .getValue(); + } + + // reduce slot millis + if (this.counters.getCounter(Constants.JOBINPROGRESS_COUNTER, Constants.SLOTS_MILLIS_REDUCES) != null) { + this.reduceSlotMillis = + this.counters.getCounter(Constants.JOBINPROGRESS_COUNTER, Constants.SLOTS_MILLIS_REDUCES) + .getValue(); + } + + // reduce shuffle bytes + if (this.reduceCounters.getCounter(Constants.TASK_COUNTER, Constants.REDUCE_SHUFFLE_BYTES) != null) { + this.reduceShuffleBytes = + this.reduceCounters.getCounter(Constants.TASK_COUNTER, Constants.REDUCE_SHUFFLE_BYTES) + .getValue(); + } + + // populate the task-level data + //populateTasks(result.getFamilyMap(Constants.TASK_FAM_BYTES)); + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/JobHistoryKeys.java b/hraven-core/src/main/java/com/twitter/hraven/JobHistoryKeys.java new file mode 100644 index 0000000..1e3f920 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/JobHistoryKeys.java @@ -0,0 +1,99 @@ +package com.twitter.hraven; + +import 
java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Contains the extract of the keys enum from + * {@link org.apache.hadoop.mapreduce.JobHistoryCopy} class + * for hadoop1 + * + * Job history files contain key="value" pairs in hadoop1, + * where keys belong to this enum + * This class acts as a global namespace for all keys. + * + * *TODO* + * When job history keys are added/removed from original enum, + * this class may need to be modified as well. + * (just like the same goes for + * {@link org.apache.hadoop.mapreduce.JobHistoryCopy} as well + * in hraven etl for hadoop1) + * + */ + +public enum JobHistoryKeys { + JOBTRACKERID, START_TIME, FINISH_TIME, + JOBID, JOBNAME, USER, JOBCONF, SUBMIT_TIME, + LAUNCH_TIME, TOTAL_MAPS, TOTAL_REDUCES, + FAILED_MAPS, FAILED_REDUCES, + FINISHED_MAPS, FINISHED_REDUCES, + JOB_STATUS, TASKID, HOSTNAME, TASK_TYPE, + ERROR, TASK_ATTEMPT_ID, TASK_STATUS, + COPY_PHASE, SORT_PHASE, REDUCE_PHASE, + SHUFFLE_FINISHED, SORT_FINISHED, COUNTERS, + SPLITS, JOB_PRIORITY, HTTP_PORT, + TRACKER_NAME, STATE_STRING, VERSION, + MAP_COUNTERS, REDUCE_COUNTERS, + VIEW_JOB, MODIFY_JOB, JOB_QUEUE; + + /** + * Job history key names as bytes + */ + public static final Map KEYS_TO_BYTES = + new HashMap(); + static { + for (JobHistoryKeys k : JobHistoryKeys.values()) { + KEYS_TO_BYTES.put(k, Bytes.toBytes(k.toString().toLowerCase())); + } + } + + /** + * Data types represented by each of the defined job history field names + */ + @SuppressWarnings("rawtypes") + public static Map KEY_TYPES = new HashMap(); + static { + KEY_TYPES.put(JOBTRACKERID, String.class); + KEY_TYPES.put(START_TIME, Long.class); + KEY_TYPES.put(FINISH_TIME, Long.class); + KEY_TYPES.put(JOBID, String.class); + KEY_TYPES.put(JOBNAME, String.class); + KEY_TYPES.put(USER, String.class); + KEY_TYPES.put(JOBCONF, String.class); + KEY_TYPES.put(SUBMIT_TIME, Long.class); + KEY_TYPES.put(LAUNCH_TIME, Long.class); + KEY_TYPES.put(TOTAL_MAPS, Long.class); + KEY_TYPES.put(TOTAL_REDUCES, Long.class); + KEY_TYPES.put(FAILED_MAPS, Long.class); + KEY_TYPES.put(FAILED_REDUCES, Long.class); + KEY_TYPES.put(FINISHED_MAPS, Long.class); + KEY_TYPES.put(FINISHED_REDUCES, Long.class); + KEY_TYPES.put(JOB_STATUS, String.class); + KEY_TYPES.put(TASKID, String.class); + KEY_TYPES.put(HOSTNAME, String.class); + KEY_TYPES.put(TASK_TYPE, String.class); + KEY_TYPES.put(ERROR, String.class); + KEY_TYPES.put(TASK_ATTEMPT_ID, String.class); + KEY_TYPES.put(TASK_STATUS, String.class); + KEY_TYPES.put(COPY_PHASE, String.class); + KEY_TYPES.put(SORT_PHASE, String.class); + KEY_TYPES.put(REDUCE_PHASE, String.class); + KEY_TYPES.put(SHUFFLE_FINISHED, Long.class); + KEY_TYPES.put(SORT_FINISHED, Long.class); + KEY_TYPES.put(COUNTERS, String.class); + KEY_TYPES.put(SPLITS, String.class); + KEY_TYPES.put(JOB_PRIORITY, String.class); + KEY_TYPES.put(HTTP_PORT, Integer.class); + KEY_TYPES.put(TRACKER_NAME, String.class); + KEY_TYPES.put(STATE_STRING, String.class); + KEY_TYPES.put(VERSION, String.class); + KEY_TYPES.put(MAP_COUNTERS, String.class); + KEY_TYPES.put(REDUCE_COUNTERS, String.class); + KEY_TYPES.put(VIEW_JOB, String.class); + KEY_TYPES.put(MODIFY_JOB, String.class); + KEY_TYPES.put(JOB_QUEUE, String.class); + } + +} \ No newline at end of file diff --git a/hraven-core/src/main/java/com/twitter/hraven/JobId.java b/hraven-core/src/main/java/com/twitter/hraven/JobId.java new file mode 100644 index 0000000..887a0a5 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/JobId.java @@ -0,0 
+1,139 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import org.apache.commons.lang.builder.CompareToBuilder; +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.codehaus.jackson.annotate.JsonCreator; +import org.codehaus.jackson.annotate.JsonProperty; + +/** + * Job identifier with individual elements of the jobtracker assigned ID parsed + * apart. The jobtracker ID is parsed as: job_[epoch]_[sequence] + * + */ +public class JobId implements Comparable { + protected static final String JOB_ID_SEP = "_"; + /** + * The jobtracker start time from the job ID, obtained from parsing the + * center component of the job ID. + */ + protected long jobEpoch; + /** + * The jobtracker assigned sequence number for the job, obtained from parsing + * the last component of the job ID. + */ + protected long jobSequence; + + @JsonCreator + public JobId(@JsonProperty("jobIdString") String jobId) { + if (jobId != null) { + String[] elts = jobId.trim().split(JOB_ID_SEP); + try { + this.jobEpoch = Long.parseLong(elts[1]); + this.jobSequence = Long.parseLong(elts[2]); + } catch (Exception e) { + throw new IllegalArgumentException("Invalid job ID '"+jobId+ + "', must be in the format 'job_[0-9]+_[0-9]+'"); + } + } + } + + public JobId(long epoch, long seq) { + this.jobEpoch = epoch; + this.jobSequence = seq; + } + + public JobId(JobId idToCopy) { + if (idToCopy != null) { + this.jobEpoch = idToCopy.getJobEpoch(); + this.jobSequence = idToCopy.getJobSequence(); + } + } + + /** + * Returns the epoch value from the job ID. The epoch value is generated by simply + * parsing the date formatted jobtracker start time as a long value. + * @return + */ + public long getJobEpoch() { + return jobEpoch; + } + + /** + * Returns the job sequence number obtained from the final component of the job ID. + * The counter used to assign the sequence number is reset on every jobtracker + * restart, so sequence values will overlap within the same cluster. In order + * to ensure uniqueness of job IDs, the epoch and sequence values must be + * combined. 
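+ *
+ *         (Editor's note: an illustrative sketch with a hypothetical job ID,
+ *         not part of the original patch.)
+ *
+ *           JobId id = new JobId("job_201306010000_12345");
+ *           id.getJobEpoch();      // 201306010000
+ *           id.getJobSequence();   // 12345
+ *           id.getJobIdString();   // "job_201306010000_12345"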
+ * @return + */ + public long getJobSequence() { + return jobSequence; + } + + public String getJobIdString() { + return String.format("job_%d_%04d", this.jobEpoch, this.jobSequence); + } + + public String toString() { + return getJobIdString(); + } + + /** + * Compares two JobId objects on the basis of their + * jobEpoch (jobtracker start time from the job ID) + * and + * jobSequence( jobtracker assigned sequence number for the job,) + * + * @param other + * @return 0 if this jobEpoch and jobSequence are equal to + * other jobEpoch and jobSequence, + * 1 if this jobEpoch and jobSequence are greater than + * other jobEpoch and jobSequence, + * -1 if this jobEpoch and jobSequence less than + * other jobEpoch and jobSequence + * + */ + @Override + public int compareTo(JobId o) { + if (o == null) { + // nulls sort last + return -1; + } + + return new CompareToBuilder() + .append(this.jobEpoch, o.getJobEpoch()) + .append(this.jobSequence, o.getJobSequence()) + .toComparison(); + } + + @Override + public boolean equals(Object other) { + if (other != null && other instanceof JobId) { + return compareTo((JobId)other) == 0; + } + return false; + } + + @Override + public int hashCode(){ + return new HashCodeBuilder() + .append(this.jobEpoch) + .append(this.jobSequence) + .toHashCode(); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/JobKey.java b/hraven-core/src/main/java/com/twitter/hraven/JobKey.java new file mode 100644 index 0000000..5e094ce --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/JobKey.java @@ -0,0 +1,149 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import org.apache.commons.lang.builder.CompareToBuilder; +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.codehaus.jackson.annotate.JsonCreator; +import org.codehaus.jackson.annotate.JsonProperty; + +/** + * Represents the row key for a given job. Row keys are stored as: username ! + * appid ! version ! runid ! jobid + */ +@SuppressWarnings("rawtypes") +public class JobKey extends FlowKey implements Comparable{ + + /** + * Fully qualified cluster + parsed job identifier + */ + private final QualifiedJobId jobId; + + /** + * Constructor. + * + * @param cluster + * the Hadoop cluster on which the job ran. + * @param userName + * the Hadoop user name that ran a job + * @param appId + * The thing that identifies an application, such as Pig script + * identifier, or Scalding identifier. + * @param runId + * The identifier that ties the various runs for this job together + * @param jobId + * The Hadoop generated MapReduce JobID. + */ + public JobKey(String cluster, String userName, String appId, long runId, + String jobId) { + // TODO: Change contract to allow for nulls and advertise strings. 
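+    // (Editor's note) This convenience constructor wraps the raw cluster and
+    // job ID strings into a QualifiedJobId and delegates to the canonical
+    // constructor below.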
+ this(new QualifiedJobId(cluster, jobId), userName, appId, runId); + } + + @JsonCreator + public JobKey(@JsonProperty("cluster") String cluster, + @JsonProperty("userName") String userName, + @JsonProperty("appId") String appId, + @JsonProperty("runId") long runId, + @JsonProperty("jobId") JobId jobId) { + this(new QualifiedJobId(cluster, jobId), userName, appId, runId); + } + + /** + * Creates a new JobKey from the given parameters + * + * @param qualifiedJobId The combined cluster + job ID + * @param userName The user name that ran the job + * @param appId The application identifier + * @param runId The run timestamp + */ + public JobKey(QualifiedJobId qualifiedJobId, String userName, String appId, + long runId) { + super(qualifiedJobId.getCluster(), userName, appId, runId); + this.jobId = qualifiedJobId; + } + + /** + * Constructor. + * + * @param jobDesc + * from which to construct this JobKey. + */ + public JobKey(JobDesc jobDesc) { + this(jobDesc.getCluster(), jobDesc.getUserName(), jobDesc.getAppId(), + jobDesc.getRunId(), jobDesc.getJobId()); + } + + /** + * @return The fully qualified cluster + parsed job ID + */ + public QualifiedJobId getQualifiedJobId() { + return jobId; + } + + /** + * @return The Hadoop map-reduce Job identifier as run on the JobTracker. + */ + public JobId getJobId() { + return jobId; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + public String toString() { + return getCluster() + Constants.SEP + this.userName + Constants.SEP + + this.appId + Constants.SEP + this.getRunId() + + Constants.SEP + this.jobId.getJobIdString(); + } + + /** + * Compares two JobKey QualifiedJobId + * + * @param other + * @return 0 if the Qualified Job Ids are equal, + * 1 if this QualifiedJobId greater than other QualifiedJobId, + * -1 if this QualifiedJobId is less than other QualifiedJobId + */ + @Override + public int compareTo(Object other) { + if (other == null) { + return -1; + } + JobKey otherKey = (JobKey)other; + return new CompareToBuilder().appendSuper(super.compareTo(otherKey)) + .append(this.jobId, otherKey.getJobId()) + .toComparison(); + } + + @Override + public boolean equals(Object other) { + if (other instanceof JobKey) { + return compareTo((JobKey)other) == 0; + } + return false; + } + + @Override + public int hashCode(){ + return new HashCodeBuilder().appendSuper(super.hashCode()) + .append(this.jobId) + .toHashCode(); + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/MRJobDescFactory.java b/hraven-core/src/main/java/com/twitter/hraven/MRJobDescFactory.java new file mode 100644 index 0000000..a3f21e0 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/MRJobDescFactory.java @@ -0,0 +1,62 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven; + +import org.apache.hadoop.conf.Configuration; + +/** + * Used to create {@link JobKey} instances that can deal with + * {@link Configuration} file (contents) for {@link Framework#NONE} + * + */ +public class MRJobDescFactory extends JobDescFactoryBase { + + /* + * (non-Javadoc) + * + * @see + * com.twitter.hraven.JobKeyFactoryBase#create(com.twitter.corestorage + * .rhaven.QualifiedJobId, long, org.apache.hadoop.conf.Configuration) + */ + JobDesc create(QualifiedJobId qualifiedJobId, long submitTimeMillis, + Configuration jobConf) { + // TODO: Get the actual values appropriate for the plain Hadoop jobs. + + String appId = getAppId(jobConf); + + long appSubmitTimeMillis = jobConf.getLong(Constants.MR_RUN_CONF_KEY, + submitTimeMillis); + + + return create(qualifiedJobId, jobConf, appId, Constants.UNKNOWN, + Framework.NONE, appSubmitTimeMillis); + } + + /* + * (non-Javadoc) + * + * @see + * com.twitter.hraven.JobDescFactoryBase#getAppIdFromJobName(java.lang.String) + */ + String getAppIdFromJobName(String jobName) { + int firstOpenBracketPos = jobName.indexOf("["); + if (firstOpenBracketPos > -1) { + return jobName.substring(0, firstOpenBracketPos); + } + return jobName; + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/PigJobDescFactory.java b/hraven-core/src/main/java/com/twitter/hraven/PigJobDescFactory.java new file mode 100644 index 0000000..08f58b4 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/PigJobDescFactory.java @@ -0,0 +1,109 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; + +/** + * Used to {@link JobKey} instances that can deal with {@link Configuration} + * file (contents) for {@link Framework#PIG} + */ +public class PigJobDescFactory extends JobDescFactoryBase { + + private static Pattern scheduledJobnamePattern = Pattern + .compile(Constants.PIG_SCHEDULED_JOBNAME_PATTERN_REGEX); + private static Pattern pigLogfilePattern = Pattern + .compile(Constants.PIG_LOGFILE_PATTERN_REGEX); + + // TODO: Make this configurable + public static final String SCHEDULED_PREFIX = "oink "; + + /* + * (non-Javadoc) + * + * @see + * com.twitter.hraven.JobKeyFactoryBase#create(com.twitter.corestorage + * .rhaven.QualifiedJobId, long, org.apache.hadoop.conf.Configuration) + */ + public JobDesc create(QualifiedJobId qualifiedJobId, long submitTimeMillis, + Configuration jobConf) { + String appId = getAppId(jobConf); + String version = jobConf.get(Constants.PIG_VERSION_CONF_KEY, + Constants.UNKNOWN); + long pigSubmitTimeMillis = jobConf.getLong(Constants.PIG_RUN_CONF_KEY, 0); + + // This means that Constants.PIG_RUN_CONF_KEY was not present (for jobs + // launched with an older pig version). 
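+    // (Editor's note) The fallback order below is: the timestamp embedded in
+    // the pig log file name when one is recorded, otherwise the individual
+    // job's own submit time, which gives up on grouping the jobs of that run
+    // together.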
+ if (pigSubmitTimeMillis == 0) { + String pigLogfile = jobConf.get(Constants.PIG_LOG_FILE_CONF_KEY); + if (pigLogfile == null) { + // Should be rare, but we're seeing this happen occasionally + // Give up on grouping the jobs within the run together, and treat these as individual runs. + pigSubmitTimeMillis = submitTimeMillis; + } else { + pigSubmitTimeMillis = getScriptStartTimeFromLogfileName(pigLogfile); + } + } + + return create(qualifiedJobId, jobConf, appId, version, Framework.PIG, + pigSubmitTimeMillis); + } + + /* + * (non-Javadoc) + * + * @see + * com.twitter.hraven.JobDescFactoryBase#getAppIdFromJobName(java.lang.String) + */ + String getAppIdFromJobName(String jobName) { + if (jobName == null) { + return null; + } + + Matcher matcher = scheduledJobnamePattern.matcher(jobName); + + // TODO: Externalize patterns to make them configurable + if (matcher.matches()) { + jobName = SCHEDULED_PREFIX + matcher.group(1); + } + + return jobName; + } + + /** + * @param pigLogfile + * as obtained from the JobConfig + * @return + */ + public static long getScriptStartTimeFromLogfileName(String pigLogfile) { + long pigSubmitTimeMillis = 0; + + if (pigLogfile == null) { + return pigSubmitTimeMillis; + } + + Matcher matcher = pigLogfilePattern.matcher(pigLogfile); + if (matcher.matches()) { + String submitTimeMillisString = matcher.group(1); + pigSubmitTimeMillis = Long.parseLong(submitTimeMillisString); + } + return pigSubmitTimeMillis; + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/QualifiedJobId.java b/hraven-core/src/main/java/com/twitter/hraven/QualifiedJobId.java new file mode 100644 index 0000000..9feabca --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/QualifiedJobId.java @@ -0,0 +1,55 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +/** + * The job ID should be relatively unique, unless two clusters start at the same + * time. However, given a jobId it is not immediately clear which cluster a job + * ran on (unless the cluster has not been restarted and the prefix is still the + * current one). This class represents the fully qualified job identifier. + * + */ +public class QualifiedJobId extends JobId { + + /** + * The Hadoop cluster on which the job ran. + */ + private final String cluster; + + /** + * Constructor. + * + * @param cluster + * @param jobId + */ + public QualifiedJobId(String cluster, String jobId) { + super(jobId); + this.cluster = (cluster != null ? cluster.trim() : ""); + } + + public QualifiedJobId(String cluster, JobId jobId) { + super(jobId); + this.cluster = (cluster != null ? cluster.trim() : ""); + } + + /** + * @return The Hadoop cluster on which the job ran. 
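+ *
+ *         (Editor's note: illustrative, with hypothetical values; not part of
+ *         the original patch.)
+ *
+ *           QualifiedJobId qid = new QualifiedJobId("cluster1@dc1", "job_201306010000_0001");
+ *           qid.getCluster();        // "cluster1@dc1"
+ *           qid.getJobIdString();    // "job_201306010000_0001"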
+ */ + public String getCluster() { + return cluster; + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/Range.java b/hraven-core/src/main/java/com/twitter/hraven/Range.java new file mode 100644 index 0000000..635e091 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/Range.java @@ -0,0 +1,60 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +/** + * A range of (sorted) items with a min and a max. + */ +public class Range { + + /** + * The minimum item of class {@link E} in this range. + */ + private final E min; + + /** + * The maximum item of class {@link E} in this range. + */ + private final E max; + + /** + * Constructs a range + * + * @param min + * the minimum of this range + * @param max + * the maximum of this range + */ + public Range(E min, E max) { + this.min = min; + this.max = max; + } + + /** + * @return the min of the range + */ + public E getMin() { + return min; + } + + /** + * @return the max of the range. + */ + public E getMax() { + return max; + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/ScaldingJobDescFactory.java b/hraven-core/src/main/java/com/twitter/hraven/ScaldingJobDescFactory.java new file mode 100644 index 0000000..8e6dc73 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/ScaldingJobDescFactory.java @@ -0,0 +1,138 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; + +import com.twitter.hraven.util.DateUtil; + +/** + * Used to create {@link JobKey} instances that can deal with + * {@link Configuration} file (contents) for {@link Framework#SCALDING} + * + */ +public class ScaldingJobDescFactory extends JobDescFactoryBase { + + /** Regex to clear out any portion of the job name contained in square brackets */ + private Pattern stripBracketsPattern = Pattern.compile("\\[.*\\]\\s*"); + /** Regex to strip any remaining job sequence information from the app ID */ + private Pattern stripSequencePattern = Pattern.compile("^(.*)/\\(\\d+/\\d+\\).*$"); + + @Override + JobDesc create(QualifiedJobId qualifiedJobId, long submitTimeMillis, + Configuration jobConf) { + + String appId = getAppId(jobConf); + if (Constants.UNKNOWN.equals(appId)) { + // Fall back to cascading.app.id, it's not readable but should be correct + appId = cleanAppId(jobConf.get(Constants.CASCADING_APP_ID_CONF_KEY)); + } + + String version = jobConf.get(Constants.CASCADING_VERSION_CONF_KEY); + // TODO: What to put for older flows that do not contain this? + + // TODO: Figure out how to get a proper flow submit time for Scalding jobs. + // For now, hack something together from the flowId + long scaldingSubmitTimeMillis = getFlowSubmitTimeMillis(jobConf, + submitTimeMillis); + + return create(qualifiedJobId, jobConf, appId, version, + Framework.SCALDING, scaldingSubmitTimeMillis); + } + + @Override + String getAppIdFromJobName(String jobName) { + return stripAppId(jobName); + } + + /** + * Strips out metadata in brackets to get a clean app name. There are multiple job name formats + * used by various frameworks. This method attempts to normalize these job names into a somewhat + * human readable appId format. + */ + String stripAppId(String origId) { + if (origId == null || origId.isEmpty()) { + return ""; + } + Matcher m = stripBracketsPattern.matcher(origId); + String cleanedAppId = m.replaceAll(""); + Matcher tailMatcher = stripSequencePattern.matcher(cleanedAppId); + if (tailMatcher.matches()) { + cleanedAppId = tailMatcher.group(1); + } + return cleanedAppId; + } + + /** + * Returns the flow submit time for this job or a computed substitute that + * will at least be consistent for all jobs in a flow. + * + * The time is computed according to: + *

+ * <ol>
+ *   <li>use "scalding.flow.submitted.timestamp" if present</li>
+ *   <li>otherwise use "cascading.flow.id" as a substitute</li>
+ * </ol>
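+ * <p>
+ * A worked illustration of the fallback path (the flow ID value is hypothetical):
+ * <pre>
+ *   // first 16 hex chars of the "cascading.flow.id" value, e.g. "00000000000003e8"
+ *   long tmpFlow = Long.parseLong("00000000000003e8", 16);          // 1000
+ *   long cascadingSubmitTimeMillis = DateUtil.getMonthStart(submitTimeMillis)
+ *       + (tmpFlow % DateUtil.MONTH_IN_MILLIS);                     // month start + 1000 ms
+ * </pre>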
+ * + * @param jobConf + * The job configuration + * @param submitTimeMillis + * of a individual job in the flow + * @return when the entire flow started, or else at least something that binds + * all jobs in a flow together. + */ + static long getFlowSubmitTimeMillis(Configuration jobConf, + long submitTimeMillis) { + // TODO: Do some parsing / hacking on this. + // Grab the year/month component and add part of the flowId turned into long + // kind of a thing. + + long cascadingSubmitTimeMillis = jobConf.getLong( + Constants.CASCADING_RUN_CONF_KEY, 0); + + if (cascadingSubmitTimeMillis == 0) { + // Convert hex encoded flow ID (128-bit MD5 hash) into long as a substitute + String flowId = jobConf.get(Constants.CASCADING_FLOW_ID_CONF_KEY); + if (flowId != null && !flowId.isEmpty()) { + if (flowId.length() > 16) { + flowId = flowId.substring(0, 16); + } + try { + long tmpFlow = Long.parseLong(flowId, 16); + // need to prevent the computed run ID from showing up in the future, + // so we don't "mask" jobs later submitted with the correct property + + // make this show up within the job submit month + long monthStart = DateUtil.getMonthStart(submitTimeMillis); + // this still allows these jobs to show up in the "future", but at least + // constrains to current month + cascadingSubmitTimeMillis = monthStart + (tmpFlow % DateUtil.MONTH_IN_MILLIS); + } catch (NumberFormatException nfe) { + // fall back to the job submit time + cascadingSubmitTimeMillis = submitTimeMillis; + } + } else { + // fall back to the job submit time + cascadingSubmitTimeMillis = submitTimeMillis; + } + } + + return cascadingSubmitTimeMillis; + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/TaskDetails.java b/hraven-core/src/main/java/com/twitter/hraven/TaskDetails.java new file mode 100644 index 0000000..927d94e --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/TaskDetails.java @@ -0,0 +1,259 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven; + +import java.util.Map; + +import org.apache.commons.lang.builder.CompareToBuilder; +import org.apache.commons.lang.builder.HashCodeBuilder; +import org.apache.hadoop.hbase.util.Bytes; +import com.twitter.hraven.datasource.JobHistoryService; + +/** + */ +public class TaskDetails implements Comparable { + + private TaskKey taskKey; + + // task-level stats + private String taskId; + private String type; + private String status; + private String[] splits; + private long startTime; + private long finishTime; + + // task-level counters + private CounterMap counters = new CounterMap(); + + // task attempt specific fields + private String taskAttemptId; + private String trackerName; + private int httpPort; + private String hostname; + private String state; + private String error; + private long shuffleFinished; + private long sortFinished; + + public TaskDetails(TaskKey taskKey) { + this.taskKey = taskKey; + } + + public TaskKey getTaskKey() { + return taskKey; + } + + public String getTaskId() { + return taskId; + } + + public void setTaskId(String taskId) { + this.taskId = taskId; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getStatus() { + return status; + } + + public void setStatus(String status) { + this.status = status; + } + + public String[] getSplits() { + return splits; + } + + public void setSplits(String[] splits) { + this.splits = splits; + } + + public long getStartTime() { + return startTime; + } + + public void setStartTime(long startTime) { + this.startTime = startTime; + } + + public long getFinishTime() { + return finishTime; + } + + public void setFinishTime(long finishTime) { + this.finishTime = finishTime; + } + + public CounterMap getCounters() { + return this.counters; + } + + /** + * Compares two TaskDetails objects on the basis of their TaskKey + * + * @param other + * @return 0 if this TaskKey is equal to the other TaskKey, + * 1 if this TaskKey greater than other TaskKey, + * -1 if this TaskKey is less than other TaskKey + * + */ + @Override + public int compareTo(TaskDetails otherTask) { + if (otherTask == null) { + return -1; + } + + return new CompareToBuilder().append(this.taskKey, otherTask.getTaskKey()) + .toComparison(); + } + + @Override + public boolean equals(Object other) { + if (other instanceof TaskDetails ) { + return compareTo((TaskDetails)other) == 0; + } + return false; + } + + @Override + public int hashCode(){ + return new HashCodeBuilder() + .append(this.taskKey) + .toHashCode(); + } + + /* *** Task attempt properties *** */ + + public String getTaskAttemptId() { + return taskAttemptId; + } + + public void setTaskAttemptId(String taskAttemptId) { + this.taskAttemptId = taskAttemptId; + } + + public String getTrackerName() { + return trackerName; + } + + public void setTrackerName(String trackerName) { + this.trackerName = trackerName; + } + + public int getHttpPort() { + return httpPort; + } + + public void setHttpPort(int httpPort) { + this.httpPort = httpPort; + } + + public String getHostname() { + return hostname; + } + + public void setHostname(String hostname) { + this.hostname = hostname; + } + + public String getState() { + return state; + } + + public void setState(String state) { + this.state = state; + } + + public String getError() { + return error; + } + + public void setError(String error) { + this.error = error; + } + + public long getShuffleFinished() { + return shuffleFinished; + } + + public void 
setShuffleFinished(long shuffleFinished) { + this.shuffleFinished = shuffleFinished; + } + + public long getSortFinished() { + return sortFinished; + } + + public void setSortFinished(long sortFinished) { + this.sortFinished = sortFinished; + } + + public void populate(Map taskValues) { + this.taskId = Bytes.toString( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TASKID))); + this.type = Bytes.toString( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TASK_TYPE))); + this.status = Bytes.toString( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TASK_STATUS))); + String taskSplits = Bytes.toString( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.SPLITS))); + if (taskSplits != null) { + this.splits = taskSplits.split(","); + } + this.startTime = Bytes.toLong( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.START_TIME))); + this.finishTime = Bytes.toLong( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FINISH_TIME))); + + this.taskAttemptId = Bytes.toString( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TASK_ATTEMPT_ID))); + this.trackerName = Bytes.toString( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TRACKER_NAME))); + byte[] httpPortBytes = + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.HTTP_PORT)); + if (httpPortBytes != null) { + this.httpPort = Bytes.toInt(httpPortBytes); + } + this.hostname = Bytes.toString( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.HOSTNAME))); + this.state = Bytes.toString( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.STATE_STRING))); + this.error = Bytes.toString( + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.ERROR))); + byte[] shuffleFinishedBytes = + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.SHUFFLE_FINISHED)); + if (shuffleFinishedBytes != null) { + this.shuffleFinished = Bytes.toLong(shuffleFinishedBytes); + } + byte[] sortFinishedBytes = + taskValues.get(JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.SORT_FINISHED)); + if (sortFinishedBytes != null) { + this.sortFinished = Bytes.toLong(sortFinishedBytes); + } + + // populate task counters + this.counters = JobHistoryService.parseCounters( + Constants.COUNTER_COLUMN_PREFIX_BYTES, taskValues); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/TaskKey.java b/hraven-core/src/main/java/com/twitter/hraven/TaskKey.java new file mode 100644 index 0000000..b8355c1 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/TaskKey.java @@ -0,0 +1,82 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import org.apache.commons.lang.builder.CompareToBuilder; +import org.apache.commons.lang.builder.HashCodeBuilder; + + +/** + * Represents the row key for an individual job task. This key shares all the + * same components from the job key, with the additional of the task ID: + *
+ * <pre>
+ *   (m|r)_tasknumber(_attemptnumber)?
+ * </pre>
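+ * <p>
+ * A minimal construction sketch (the parent job key variable and the task ID value are
+ * illustrative):
+ * <pre>
+ *   // given an existing JobKey for the parent job
+ *   TaskKey mapTaskKey = new TaskKey(jobKey, "m_000023_0");
+ * </pre>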
+ */ +//Leaving comparable as a raw due to sub-typing/overriding issues. +@SuppressWarnings("rawtypes") +public class TaskKey extends JobKey implements Comparable { + private String taskId; + + public TaskKey(JobKey jobKey, String taskId) { + super(jobKey.getQualifiedJobId(), jobKey.getUserName(), jobKey.getAppId(), + jobKey.getRunId()); + this.taskId = taskId; + } + + public String getTaskId() { + return this.taskId; + } + + public String toString() { + return new StringBuilder(super.toString()) + .append(Constants.SEP).append(taskId).toString(); + } + + /** + * Compares two TaskKey objects on the basis of their taskId + * + * @param other + * @return 0 if the taskIds are equal, + * 1 if this taskId is greater than other taskId, + * -1 if this taskId is less than other taskId + */ + @Override + public int compareTo(Object other) { + if (other == null) { + return -1; + } + TaskKey otherKey = (TaskKey) other; + return new CompareToBuilder().appendSuper(super.compareTo(otherKey)) + .append(this.taskId, otherKey.getTaskId()) + .toComparison(); + } + + @Override + public boolean equals(Object other) { + if (other instanceof TaskKey) { + return compareTo((TaskKey)other) == 0; + } + return false; + } + + @Override + public int hashCode(){ + return new HashCodeBuilder().appendSuper(super.hashCode()) + .append(this.taskId) + .toHashCode(); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/AppVersionService.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/AppVersionService.java new file mode 100644 index 0000000..9a5a7c4 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/AppVersionService.java @@ -0,0 +1,209 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.util.Bytes; + +import com.google.common.collect.Lists; +import com.twitter.hraven.Constants; + +/** + * Reads and writes information about the mapping of application IDs + * to version numbers. + */ +public class AppVersionService { + + private static Log LOG = LogFactory.getLog(AppVersionService.class); + + @SuppressWarnings("unused") + private final Configuration conf; + private final HTable versionsTable; + + public AppVersionService(Configuration conf) throws IOException { + this.conf = conf; + this.versionsTable = new HTable(conf, Constants.HISTORY_APP_VERSION_TABLE); + } + + /** + * Returns the most recent version ID for the given application. 
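+ * <p>
+ * A minimal usage sketch (the configuration variable and the identifiers are illustrative):
+ * <pre>
+ *   AppVersionService service = new AppVersionService(hbaseConf);
+ *   String latest = service.getLatestVersion("cluster1@dc1", "edgar", "Sleep");
+ *   service.close();
+ * </pre>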
+ * + * @param cluster + * @param user + * @param appId + * @return the most recent version ID or {@code null} if no versions are found + * @throws IOException + */ + public String getLatestVersion(String cluster, String user, String appId) + throws IOException { + Get get = new Get(getRowKey(cluster, user, appId)); + List versions = Lists.newArrayList(); + Result r = this.versionsTable.get(get); + if (r != null && !r.isEmpty()) { + for (KeyValue kv : r.list()) { + versions.add( + new VersionInfo(Bytes.toString(kv.getQualifier()), Bytes.toLong(kv.getValue())) ); + } + } + + if (versions.size() > 0) { + Collections.sort(versions); + return versions.get(0).getVersion(); + } + + return null; + } + + /** + * Returns the list of distinct versions for the given application + * sorted in reverse chronological order + * + * @param cluster + * @param user + * @param appId + * @return the list of versions sorted in reverse chronological order + * (the list will be empty if no versions are found) + * @throws IOException + */ + public List getDistinctVersions(String cluster, String user, String appId) + throws IOException { + Get get = new Get(getRowKey(cluster, user, appId)); + List versions = Lists.newArrayList(); + Long ts = 0L; + Result r = this.versionsTable.get(get); + if (r != null && !r.isEmpty()) { + for (KeyValue kv : r.list()) { + ts = 0L; + try { + ts = Bytes.toLong(kv.getValue()); + versions.add( + new VersionInfo(Bytes.toString(kv.getQualifier()), ts) ); + } + catch (IllegalArgumentException e1 ) { + // Bytes.toLong may throw IllegalArgumentException, although unlikely. + LOG.error("Caught conversion error while converting timestamp to long value " + + e1.getMessage()); + // rethrow the exception in order to propagate it + throw e1; + } + } + } + + if (versions.size() > 0) { + Collections.sort(versions); + } + + return versions; + } + + /** + * Adds an entry for the given version, if it does not already exist. If the + * given timestamp is earlier than the currently stored timestamp for the version, + * it will be updated. 
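+ * <p>
+ * Sketch of a typical call (the identifiers and timestamp are illustrative):
+ * <pre>
+ *   // given an AppVersionService instance "service"
+ *   boolean added = service.addVersion("cluster1@dc1", "edgar", "Sleep", "42", 1370044800000L);
+ * </pre>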
+ * + * @param cluster cluster identifier (cluster@identifier) + * @param user user name + * @param appId application identifier + * @param version version identifier + * @param timestamp timestamp to store with this version (only the earliest timestamp is stored) + * @return {@code true} if a new version entry was added, {@code false} + * if the version already existed + */ + public boolean addVersion(String cluster, String user, String appId, + String version, long timestamp) throws IOException { + boolean updated = false; + + // check if the version already exists + byte[] rowKey = getRowKey(cluster, user, appId); + byte[] versionCol = Bytes.toBytes(version); + + int attempts = 0; + // retry up to this many times for checkAndPut failures + int maxAttempts = 3; + boolean checkForUpdate = true; + + while (checkForUpdate && attempts < maxAttempts) { + attempts++; + // values for conditional update + Put p = null; + byte[] expectedValue = null; + + Get get = new Get(rowKey); + get.addColumn(Constants.INFO_FAM_BYTES, versionCol); + Result r = this.versionsTable.get(get); + if (r != null && !r.isEmpty()) { + byte[] storedValue = r.getValue(Constants.INFO_FAM_BYTES, versionCol); + long storedTS = Bytes.toLong(storedValue); + if (timestamp < storedTS) { + // update the stored timestamp to our earlier value + p = new Put(rowKey); + p.add(Constants.INFO_FAM_BYTES, versionCol, Bytes.toBytes(timestamp)); + expectedValue = storedValue; + } else { + // version exists and exceeds our value, no update necessary + checkForUpdate = false; + } + } else { + // no stored value + p = new Put(rowKey); + p.add(Constants.INFO_FAM_BYTES, versionCol, Bytes.toBytes(timestamp)); + } + + if (p != null) { + // we have an updated value to add + updated = this.versionsTable.checkAndPut( + rowKey, Constants.INFO_FAM_BYTES, versionCol, expectedValue, p); + checkForUpdate = !updated; + if (!updated) { + LOG.warn("Update of cluster="+cluster+", user="+user+", app="+appId+ + ", version="+version+" to timestamp "+timestamp+ + " failed because currently set value changed!"+ + " (attempt "+attempts+" of "+maxAttempts+")"); + } + } + } + + return updated; + } + + /** + * Close the underlying HTable reference to free resources + * @throws IOException + */ + public void close() throws IOException { + if (this.versionsTable != null) { + this.versionsTable.close(); + } + } + + private byte[] getRowKey(String cluster, String user, String appId) { + String keyString = new StringBuilder(cluster) + .append(Constants.SEP).append(user) + .append(Constants.SEP).append(appId).toString(); + return Bytes.toBytes(keyString); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/ByteConverter.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/ByteConverter.java new file mode 100644 index 0000000..849ad3e --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/ByteConverter.java @@ -0,0 +1,23 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.datasource; + +/** + */ +public interface ByteConverter { + public byte[] toBytes(T object); + public T fromBytes(byte[] bytes); +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/DataException.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/DataException.java new file mode 100644 index 0000000..10101ef --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/DataException.java @@ -0,0 +1,32 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +/** + * Base exception representing errors in data retrieval or storage. + */ +public class DataException extends Exception { + + private static final long serialVersionUID = 2406302267896675759L; + + public DataException(String message) { + super(message); + } + + public DataException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowEventKeyConverter.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowEventKeyConverter.java new file mode 100644 index 0000000..0f53615 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowEventKeyConverter.java @@ -0,0 +1,59 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.FlowEventKey; +import com.twitter.hraven.util.ByteUtil; + +import org.apache.hadoop.hbase.util.Bytes; + +/** + */ +public class FlowEventKeyConverter implements ByteConverter { + private FlowKeyConverter flowKeyConverter = new FlowKeyConverter(); + + @Override + public byte[] toBytes(FlowEventKey key) { + if (key == null) { + return Constants.EMPTY_BYTES; + } + return ByteUtil.join(Constants.SEP_BYTES, flowKeyConverter.toBytes(key), + Bytes.toBytes(key.getSequence())); + } + + @Override + public FlowEventKey fromBytes(byte[] bytes) { + byte[][] splits = ByteUtil.split(bytes, Constants.SEP_BYTES, 4); + byte[][] flowKeySplits = new byte[4][]; + for (int i=0; i events) throws IOException { + List puts = new ArrayList(events.size()); + for (FlowEvent e : events) { + puts.add(createPutForEvent(e)); + } + eventTable.put(puts); + } + + /** + * Retrieves all the event rows matching a single {@link com.twitter.hraven.Flow}. 
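+ * <p>
+ * A minimal usage sketch (assumes an already-constructed service instance and flow key):
+ * <pre>
+ *   for (FlowEvent event : eventService.getFlowEvents(flowKey)) {
+ *     // process each stored event for this flow
+ *   }
+ * </pre>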
+ * @param flowKey + * @return + */ + public List getFlowEvents(FlowKey flowKey) throws IOException { + byte[] startKey = Bytes.add(flowKeyConverter.toBytes(flowKey), Constants.SEP_BYTES); + Scan scan = new Scan(startKey); + scan.setFilter(new WhileMatchFilter(new PrefixFilter(startKey))); + + List results = new ArrayList(); + ResultScanner scanner = null; + try { + scanner = eventTable.getScanner(scan); + for (Result r : scanner) { + FlowEvent event = createEventFromResult(r); + if (event != null) { + results.add(event); + } + } + } finally { + if (scanner != null) { + scanner.close(); + } + } + return results; + } + + /** + * Retrieves all events added after the given event key (with sequence numbers greater than the + * given key). If no new events are found returns an empty list. + * @param lastSeen + * @return + */ + public List getFlowEventsSince(FlowEventKey lastSeen) throws IOException { + // rows must match the FlowKey portion + SEP + byte[] keyPrefix = Bytes.add(flowKeyConverter.toBytes(lastSeen), Constants.SEP_BYTES); + // start at the next following sequence number + FlowEventKey nextEvent = new FlowEventKey(lastSeen.getCluster(), lastSeen.getUserName(), + lastSeen.getAppId(), lastSeen.getRunId(), lastSeen.getSequence()+1); + byte[] startKey = keyConverter.toBytes(nextEvent); + Scan scan = new Scan(startKey); + scan.setFilter(new WhileMatchFilter(new PrefixFilter(keyPrefix))); + + List results = new ArrayList(); + ResultScanner scanner = null; + try { + scanner = eventTable.getScanner(scan); + for (Result r : scanner) { + FlowEvent event = createEventFromResult(r); + if (event != null) { + results.add(event); + } + } + } finally { + if (scanner != null) { + scanner.close(); + } + } + return results; + } + + protected Put createPutForEvent(FlowEvent event) { + Put p = new Put(keyConverter.toBytes(event.getFlowEventKey())); + p.add(Constants.INFO_FAM_BYTES, TIMESTAMP_COL_BYTES, Bytes.toBytes(event.getTimestamp())); + if (event.getType() != null) { + p.add(Constants.INFO_FAM_BYTES, TYPE_COL_BYTES, Bytes.toBytes(event.getType())); + } + if (event.getFramework() != null) { + p.add(Constants.INFO_FAM_BYTES, Constants.FRAMEWORK_COLUMN_BYTES, + Bytes.toBytes(event.getFramework().getCode())); + } + if (event.getEventDataJSON() != null) { + p.add(Constants.INFO_FAM_BYTES, DATA_COL_BYTES, Bytes.toBytes(event.getEventDataJSON())); + } + return p; + } + + protected FlowEvent createEventFromResult(Result result) { + if (result == null || result.isEmpty()) { + return null; + } + FlowEventKey key = keyConverter.fromBytes(result.getRow()); + FlowEvent event = new FlowEvent(key); + if (result.containsColumn(Constants.INFO_FAM_BYTES, TIMESTAMP_COL_BYTES)) { + event.setTimestamp(Bytes.toLong( + result.getValue(Constants.INFO_FAM_BYTES, TIMESTAMP_COL_BYTES))); + } + if (result.containsColumn(Constants.INFO_FAM_BYTES, TYPE_COL_BYTES)) { + event.setType(Bytes.toString(result.getValue(Constants.INFO_FAM_BYTES, TYPE_COL_BYTES))); + } + if (result.containsColumn(Constants.INFO_FAM_BYTES, Constants.FRAMEWORK_COLUMN_BYTES)) { + String code = Bytes.toString(result.getValue( + Constants.INFO_FAM_BYTES, Constants.FRAMEWORK_COLUMN_BYTES)); + event.setFramework(Framework.get(code)); + } + if (result.containsColumn(Constants.INFO_FAM_BYTES, DATA_COL_BYTES)) { + event.setEventDataJSON(Bytes.toString( + result.getValue(Constants.INFO_FAM_BYTES, DATA_COL_BYTES))); + } + return event; + } + + public void close() throws IOException { + this.eventTable.close(); + } +} diff --git 
a/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowKeyConverter.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowKeyConverter.java new file mode 100644 index 0000000..c0fbfa1 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowKeyConverter.java @@ -0,0 +1,53 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.FlowKey; +import com.twitter.hraven.util.ByteUtil; + +import org.apache.hadoop.hbase.util.Bytes; + +/** + */ +public class FlowKeyConverter implements ByteConverter { + + @Override + public byte[] toBytes(FlowKey flowKey) { + if (flowKey == null) { + return Constants.EMPTY_BYTES; + } else { + return ByteUtil.join(Constants.SEP_BYTES, + Bytes.toBytes(flowKey.getCluster()), + Bytes.toBytes(flowKey.getUserName()), + Bytes.toBytes(flowKey.getAppId()), + Bytes.toBytes(flowKey.getEncodedRunId())); + } + } + + @Override + public FlowKey fromBytes(byte[] bytes) { + return fromBytes(ByteUtil.split(bytes, Constants.SEP_BYTES, 4)); + } + + public FlowKey fromBytes(byte[][] splitBytes) { + long runId = splitBytes.length > 3 ? Long.MAX_VALUE - Bytes.toLong(splitBytes[3]) : 0; + return new FlowKey( Bytes.toString(splitBytes[0]), + splitBytes.length > 1 ? Bytes.toString(splitBytes[1]) : null, + splitBytes.length > 2 ? Bytes.toString(splitBytes[2]) : null, + runId); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowQueueKeyConverter.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowQueueKeyConverter.java new file mode 100644 index 0000000..9628fab --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowQueueKeyConverter.java @@ -0,0 +1,63 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.Flow; +import com.twitter.hraven.FlowQueueKey; +import com.twitter.hraven.util.ByteUtil; + +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Handles serialization and deserialization of a {@link FlowQueueKey} to and from bytes. 
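+ * <p>
+ * Round-trip sketch (assumes an existing FlowQueueKey instance named "queueKey"):
+ * <pre>
+ *   FlowQueueKeyConverter converter = new FlowQueueKeyConverter();
+ *   byte[] rowKey = converter.toBytes(queueKey);
+ *   FlowQueueKey parsed = converter.fromBytes(rowKey);
+ * </pre>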
+ */ +public class FlowQueueKeyConverter implements ByteConverter { + @Override + public byte[] toBytes(FlowQueueKey key) { + if (key == null) { + return Constants.EMPTY_BYTES; + } + long invertedTimestamp = Long.MAX_VALUE - key.getTimestamp(); + return ByteUtil.join(Constants.SEP_BYTES, + Bytes.toBytes(key.getCluster()), + (key.getStatus() == null ? Constants.EMPTY_BYTES : key.getStatus().code()), + Bytes.toBytes(invertedTimestamp), + Bytes.toBytes(key.getFlowId())); + } + + @Override + public FlowQueueKey fromBytes(byte[] bytes) { + if (bytes == null) { + return null; + } + + byte[][] firstSplit = ByteUtil.split(bytes, Constants.SEP_BYTES, 3); + byte[] timestampBytes = null; + byte[] flowIdBytes = null; + if (firstSplit.length == 3) { + int offset = 0; + timestampBytes = ByteUtil.safeCopy(firstSplit[2], 0, 8); + offset += 8+Constants.SEP_BYTES.length; + flowIdBytes = ByteUtil.safeCopy(firstSplit[2], offset, firstSplit[2].length - offset); + } + + return new FlowQueueKey(Bytes.toString(firstSplit[0]), + firstSplit.length > 1 ? Flow.STATUS_BY_CODE.get(firstSplit[1]) : null, + timestampBytes != null ? Long.MAX_VALUE - Bytes.toLong(timestampBytes) : 0, + flowIdBytes != null ? Bytes.toString(flowIdBytes) : null); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowQueueService.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowQueueService.java new file mode 100644 index 0000000..15a0ec7 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/FlowQueueService.java @@ -0,0 +1,200 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.datasource; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.Flow; +import com.twitter.hraven.FlowKey; +import com.twitter.hraven.FlowQueueKey; +import com.twitter.hraven.util.ByteUtil; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.filter.PrefixFilter; +import org.apache.hadoop.hbase.filter.WhileMatchFilter; +import org.apache.hadoop.hbase.util.Bytes; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + */ +public class FlowQueueService { + /* Constants for column names */ + public static final String JOB_GRAPH_COL = "dag"; + public static final byte[] JOB_GRAPH_COL_BYTES = Bytes.toBytes(JOB_GRAPH_COL); + public static final String FLOW_NAME_COL = "flowname"; + public static final byte[] FLOW_NAME_COL_BYTES = Bytes.toBytes(FLOW_NAME_COL); + public static final String USER_NAME_COL = "username"; + public static final byte[] USER_NAME_COL_BYTES = Bytes.toBytes(USER_NAME_COL); + public static final String PROGRESS_COL = "progress"; + public static final byte[] PROGRESS_COL_BYTES = Bytes.toBytes(PROGRESS_COL); + + private FlowQueueKeyConverter queueKeyConverter = new FlowQueueKeyConverter(); + private FlowKeyConverter flowKeyConverter = new FlowKeyConverter(); + + private HTable flowQueueTable; + + public FlowQueueService(Configuration conf) throws IOException { + this.flowQueueTable = new HTable(conf, Constants.FLOW_QUEUE_TABLE_BYTES); + } + + public void updateFlow(FlowQueueKey key, Flow flow) throws IOException { + Put p = createPutForFlow(key, flow); + flowQueueTable.put(p); + } + + /** + * Moves a flow_queue record from one row key to another. All KeyValues in the existing row + * will be written to the new row. This would primarily be used for transitioning a flow's + * data from one status to another. 
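+ * <p>
+ * A minimal sketch of a status transition (the status constants, timestamp and flow ID are
+ * assumed values for illustration):
+ * <pre>
+ *   FlowQueueKey oldKey = new FlowQueueKey("cluster1", Flow.Status.RUNNING, ts, flowId);
+ *   FlowQueueKey newKey = new FlowQueueKey("cluster1", Flow.Status.SUCCEEDED, ts, flowId);
+ *   queueService.moveFlow(oldKey, newKey);
+ * </pre>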
+ * + * @param oldKey the existing row key to move + * @param newKey the new row key to move to + * @throws IOException + */ + public void moveFlow(FlowQueueKey oldKey, FlowQueueKey newKey) + throws DataException, IOException { + byte[] oldRowKey = queueKeyConverter.toBytes(oldKey); + Get get = new Get(oldRowKey); + Result result = flowQueueTable.get(get); + if (result == null || result.isEmpty()) { + // no existing row + throw new DataException("No row for key "+ Bytes.toStringBinary(oldRowKey)); + } + // copy the existing row to the new key + Put p = new Put(queueKeyConverter.toBytes(newKey)); + for (KeyValue kv : result.raw()) { + p.add(kv.getFamily(), kv.getQualifier(), kv.getValue()); + } + flowQueueTable.put(p); + // delete the old row + Delete d = new Delete(oldRowKey); + flowQueueTable.delete(d); + } + + protected Put createPutForFlow(FlowQueueKey key, Flow flow) { + Put p = new Put(queueKeyConverter.toBytes(key)); + if (flow.getFlowKey() != null) { + p.add(Constants.INFO_FAM_BYTES, Constants.ROWKEY_COL_BYTES, + flowKeyConverter.toBytes(flow.getFlowKey())); + } + if (flow.getJobGraphJSON() != null) { + p.add(Constants.INFO_FAM_BYTES, JOB_GRAPH_COL_BYTES, Bytes.toBytes(flow.getJobGraphJSON())); + } + if (flow.getFlowName() != null) { + p.add(Constants.INFO_FAM_BYTES, FLOW_NAME_COL_BYTES, Bytes.toBytes(flow.getFlowName())); + } + if (flow.getUserName() != null) { + p.add(Constants.INFO_FAM_BYTES, USER_NAME_COL_BYTES, Bytes.toBytes(flow.getUserName())); + } + p.add(Constants.INFO_FAM_BYTES, PROGRESS_COL_BYTES, Bytes.toBytes(flow.getProgress())); + return p; + } + + public Flow getFlowFromQueue(String cluster, long timestamp, String flowId) throws IOException { + // since flow_queue rows can transition status, we check all at once + List gets = new ArrayList(); + for (Flow.Status status : Flow.Status.values()) { + FlowQueueKey key = new FlowQueueKey(cluster, status, timestamp, flowId); + gets.add(new Get(queueKeyConverter.toBytes(key))); + } + Result[] results = flowQueueTable.get(gets); + Flow flow = null; + for (Result r : results) { + flow = createFlowFromResult(r); + if (flow != null) { + break; + } + } + return flow; + } + + public List getFlowsForStatus(String cluster, Flow.Status status, int limit) + throws IOException { + byte[] startRow = ByteUtil.join(Constants.SEP_BYTES, + Bytes.toBytes(cluster), status.code(), Constants.EMPTY_BYTES); + Scan scan = new Scan(startRow); + scan.setFilter(new WhileMatchFilter(new PrefixFilter(startRow))); + + List results = new ArrayList(limit); + ResultScanner scanner = null; + try { + scanner = flowQueueTable.getScanner(scan); + int cnt = 0; + for (Result r : scanner) { + Flow flow = createFlowFromResult(r); + if (flow != null) { + cnt++; + results.add(flow); + } + if (cnt >= limit) { + break; + } + } + } finally { + if (scanner != null) { + scanner.close(); + } + } + return results; + } + + protected Flow createFlowFromResult(Result result) { + if (result == null || result.isEmpty()) { + return null; + } + FlowQueueKey queueKey = queueKeyConverter.fromBytes(result.getRow()); + FlowKey flowKey = null; + // when flow is first being launched FlowKey may not yet be present + if (result.containsColumn(Constants.INFO_FAM_BYTES, Constants.ROWKEY_COL_BYTES)) { + flowKey = flowKeyConverter.fromBytes( + result.getValue(Constants.INFO_FAM_BYTES, Constants.ROWKEY_COL_BYTES)); + } + Flow flow = new Flow(flowKey); + flow.setFlowQueueKey(queueKey); + if (result.containsColumn(Constants.INFO_FAM_BYTES, JOB_GRAPH_COL_BYTES)) { + flow.setJobGraphJSON( + 
Bytes.toString(result.getValue(Constants.INFO_FAM_BYTES, JOB_GRAPH_COL_BYTES))); + } + if (result.containsColumn(Constants.INFO_FAM_BYTES, FLOW_NAME_COL_BYTES)) { + flow.setFlowName( + Bytes.toString(result.getValue(Constants.INFO_FAM_BYTES, FLOW_NAME_COL_BYTES))); + } + if (result.containsColumn(Constants.INFO_FAM_BYTES, USER_NAME_COL_BYTES)) { + flow.setUserName( + Bytes.toString(result.getValue(Constants.INFO_FAM_BYTES, USER_NAME_COL_BYTES))); + } + if (result.containsColumn(Constants.INFO_FAM_BYTES, PROGRESS_COL_BYTES)) { + flow.setProgress(Bytes.toInt(result.getValue(Constants.INFO_FAM_BYTES, PROGRESS_COL_BYTES))); + } + return flow; + } + + public void close() throws IOException { + if (this.flowQueueTable != null) { + this.flowQueueTable.close(); + } + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryByIdService.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryByIdService.java new file mode 100644 index 0000000..328d772 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryByIdService.java @@ -0,0 +1,104 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.JobKey; +import com.twitter.hraven.QualifiedJobId; + +/** + * Service to access the {@link Constants#HISTORY_BY_JOBID_TABLE}. + * + */ +public class JobHistoryByIdService { + private JobKeyConverter jobKeyConv = new JobKeyConverter(); + private QualifiedJobIdConverter jobIdConv = new QualifiedJobIdConverter(); + + /** + * Used to store the job to jobHistoryKey index in. + */ + private final HTable historyByJobIdTable; + + public JobHistoryByIdService(Configuration myHBaseConf) throws IOException { + historyByJobIdTable = new HTable(myHBaseConf, + Constants.HISTORY_BY_JOBID_TABLE_BYTES); + } + + /** + * Release internal HBase table instances. Must be called when consumer is + * done with this service. + * + * @throws IOException + * when bad things happen closing HBase table(s). + */ + public void close() throws IOException { + if (historyByJobIdTable != null) { + historyByJobIdTable.close(); + } + } + + /** + * Returns the JobKey for the job_history table, stored for this job ID, + * or {@code null} if not found. 
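+ * <p>
+ * A minimal usage sketch (the configuration variable, cluster name and job ID are
+ * illustrative):
+ * <pre>
+ *   JobHistoryByIdService service = new JobHistoryByIdService(hbaseConf);
+ *   JobKey key = service.getJobKeyById(new QualifiedJobId("cluster1", "job_201306192120_0001"));
+ *   service.close();
+ * </pre>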
+ * @param jobId the cluster and job ID combination to look up + * @return the JobKey instance stored, or {@code null} if not found + * @throws IOException if thrown by the HBase client + */ + public JobKey getJobKeyById(QualifiedJobId jobId) throws IOException { + byte[] indexKey = jobIdConv.toBytes(jobId); + + Get g = new Get(indexKey); + g.addColumn(Constants.INFO_FAM_BYTES, Constants.ROWKEY_COL_BYTES); + Result r = historyByJobIdTable.get(g); + if (r != null && !r.isEmpty()) { + byte[] historyKey = r.getValue(Constants.INFO_FAM_BYTES, Constants.ROWKEY_COL_BYTES); + if (historyKey != null && historyKey.length > 0) { + return jobKeyConv.fromBytes(historyKey); + } + } + return null; + } + + /** + * Create the secondary indexes records cluster!jobId->jobKey. + * + * @param jobKey + * @throws IOException + * if the entry cannot be written. + */ + public void writeIndexes(JobKey jobKey) throws IOException { + // Defensive coding + if (jobKey != null) { + byte[] jobKeyBytes = jobKeyConv.toBytes(jobKey); + byte[] rowKeyBytes = jobIdConv.toBytes( + new QualifiedJobId(jobKey.getCluster(), jobKey.getJobId()) ); + + // Insert (or update) row with jobid as the key + Put p = new Put(rowKeyBytes); + p.add(Constants.INFO_FAM_BYTES, Constants.ROWKEY_COL_BYTES, jobKeyBytes); + historyByJobIdTable.put(p); + } + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryRawService.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryRawService.java new file mode 100644 index 0000000..5303bf2 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryRawService.java @@ -0,0 +1,580 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.LinkedList; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.client.*; +import org.apache.hadoop.hbase.filter.CompareFilter; +import org.apache.hadoop.hbase.filter.FilterList; +import org.apache.hadoop.hbase.filter.InclusiveStopFilter; +import org.apache.hadoop.hbase.filter.PrefixFilter; +import org.apache.hadoop.hbase.filter.SingleColumnValueExcludeFilter; +import org.apache.hadoop.hbase.filter.SingleColumnValueFilter; +import org.apache.hadoop.hbase.util.Bytes; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.JobId; +import com.twitter.hraven.QualifiedJobId; +import com.twitter.hraven.Range; +import com.twitter.hraven.util.BatchUtil; +import com.twitter.hraven.util.ByteUtil; + +/** + * Used to store and retrieve {@link ProcessRecord} objects. 
+ */ +public class JobHistoryRawService { + private static Log LOG = LogFactory.getLog(JobHistoryRawService.class); + + private QualifiedJobIdConverter idConv = new QualifiedJobIdConverter(); + + /** + * Used to store the processRecords in HBase + */ + private final HTable rawTable; + + /** + * Constructor. Note that caller is responsible to {@link #close()} created + * instances. + * + * @param myHBaseConf + * configuration of the processing job, not the conf of the files we + * are processing. Used to connect to HBase. + * @throws IOException + * in case we have problems connecting to HBase. + */ + public JobHistoryRawService(Configuration myHBaseConf) throws IOException { + rawTable = new HTable(myHBaseConf, Constants.HISTORY_RAW_TABLE_BYTES); + } + + /** + * Given a min and max jobId, get a {@link Scan} to go through all the records + * loaded in the {@link Constants#HISTORY_RAW_TABLE}, get all the rowkeys and + * create a list of scans with batchSize number of rows in the rawTable. + *
+ * <p>
+ * Note that this can be a somewhat slow operation as the + * {@link Constants#HISTORY_RAW_TABLE} will have to be scanned. + * + * @param cluster + * on which the Hadoop jobs ran. + * @param minJobId + * used to start the scan. If null then there is no min limit on + * JobId. + * @param maxJobId + * used to end the scan (inclusive). If null then there is no max + * limit on jobId. + * @param reprocess + * Reprocess those records that may have been processed already. + * Otherwise successfully processed jobs are skipped. + * @param reloadOnly + * load only those raw records that were marked to be reloaded using + * {@link #markJobForReprocesssing(QualifiedJobId)} + * @return a scan of jobIds between the specified min and max. Retrieves only + * one version of each column. + * @throws IOException + * @throws RowKeyParseException + * when rows returned from the Raw table do not conform to the + * expected row key. + */ + public List getHistoryRawTableScans(String cluster, String minJobId, + String maxJobId, boolean reprocess, int batchSize) throws IOException, + RowKeyParseException { + + List scans = new LinkedList(); + + // Get all the values in the scan so that we can evenly chop them into + // batch size chunks. + // The problem is that processRecords min and max can have vastly + // overlapping ranges, and in addition, they may have a minJobId of a long + // running Hadoop job that is processed much later. Many jobIds that are + // of shorter jobs that have already been processed will in between the + // min and max, but since the scan returns only the records that are not + // already processed, the returned list may have large gaps. + Scan scan = getHistoryRawTableScan(cluster, minJobId, maxJobId, reprocess, + false); + + SortedSet orderedJobIds = new TreeSet(); + + ResultScanner scanner = null; + try { + LOG.info("Scanning " + Constants.HISTORY_RAW_TABLE + " table from " + + minJobId + " to " + maxJobId); + scanner = rawTable.getScanner(scan); + for (Result result : scanner) { + JobId qualifiedJobId = getQualifiedJobIdFromResult(result); + orderedJobIds.add(qualifiedJobId); + } + } finally { + if (scanner != null) { + scanner.close(); + } + } + + // Now chop the set into chunks. + List> ranges = BatchUtil.getRanges(orderedJobIds, batchSize); + LOG.info("Dividing " + orderedJobIds.size() + " jobs in " + ranges.size() + + " ranges."); + + for (Range range : ranges) { + Scan rawScan = getHistoryRawTableScan(cluster, range.getMin() + .getJobIdString(), range.getMax().getJobIdString(), reprocess, true); + scans.add(rawScan); + } + + return scans; + } + + /** + * Get a {@link Scan} to go through all the records loaded in the + * {@link Constants#HISTORY_RAW_TABLE} that match the given parameters. + * + * @param cluster + * on which the Hadoop jobs ran. + * @param minJobId + * used to start the scan. If null then there is no min limit on + * JobId. + * @param maxJobId + * used to end the scan (inclusive). If null then there is no max + * limit on jobId. + * @param reprocess + * return only those raw records that were marked to be reprocessed + * using {@link #markJobForReprocesssing(QualifiedJobId)}. Otherwise + * successfully processed jobs are skipped. + * @param reprocessOnly + * When true then reprocess argument is ignored and is assumed to be + * true. + * @param includeRaw + * whether to include the raw column family in the scan results. + * @return a scan of jobIds between the specified min and max. Retrieves only + * one version of each column. 
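+ * <p>
+ * A minimal usage sketch (the cluster name and job ID bounds are illustrative):
+ * <pre>
+ *   JobHistoryRawService rawService = new JobHistoryRawService(hbaseConf);
+ *   Scan scan = rawService.getHistoryRawTableScan("cluster1",
+ *       "job_201306192120_0001", "job_201306192120_0500", false, false);
+ * </pre>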
+ */ + public Scan getHistoryRawTableScan(String cluster, String minJobId, + String maxJobId, boolean reprocess, boolean includeRaw) { + Scan scan = new Scan(); + + LOG.info("Creating scan for cluster: " + cluster); + + // Add the columns to be pulled back by this scan. + scan.addFamily(Constants.INFO_FAM_BYTES); + if (includeRaw) { + scan.addFamily(Constants.RAW_FAM_BYTES); + } + + // Pull data only for our cluster + byte[] clusterPrefix = Bytes.toBytes(cluster + Constants.SEP); + byte[] startRow; + if (minJobId == null) { + startRow = clusterPrefix; + } else { + startRow = idConv.toBytes(new QualifiedJobId(cluster, minJobId)); + } + scan.setStartRow(startRow); + + LOG.info("Starting raw table scan at " + Bytes.toStringBinary(startRow)); + + FilterList filters = new FilterList(FilterList.Operator.MUST_PASS_ALL); + + // Scan only those raw rows for the specified cluster. + PrefixFilter prefixFilter = new PrefixFilter(clusterPrefix); + filters.addFilter(prefixFilter); + + byte[] stopRow; + // Set to stop the scan once the last row is encountered. + if (maxJobId != null) { + // The inclusive stop filter actually is the accurate representation of + // what needs to be in the result. + byte[] lastRow = idConv.toBytes(new QualifiedJobId(cluster, maxJobId)); + InclusiveStopFilter inclusiveStopFilter = new InclusiveStopFilter(lastRow); + filters.addFilter(inclusiveStopFilter); + LOG.info("Stopping raw table scan (stop filter) at " + + Bytes.toStringBinary(lastRow)); + + // Add one to the jobSequence of the maximum JobId. + JobId maximumJobId = new JobId(maxJobId); + JobId oneBiggerThanMaxJobId = new JobId(maximumJobId.getJobEpoch(), + maximumJobId.getJobSequence() + 1); + stopRow = idConv.toBytes(new QualifiedJobId(cluster, + oneBiggerThanMaxJobId)); + + } else { + char oneBiggerSep = (char) (Constants.SEP_CHAR + 1); + stopRow = Bytes.toBytes(cluster + oneBiggerSep); + } + // In addition to InclusiveStopRowFilter, set an estimated end-row that is + // guaranteed to be bigger than the last row we want (but may over-shoot a + // little). This helps the TableInput format limit the number of regions + // (-servers) that need to be targeted for this scan. + scan.setStopRow(stopRow); + LOG.info("Stopping raw table scan (stop row) at " + + Bytes.toStringBinary(stopRow)); + + scan.setFilter(filters); + + if (reprocess) { + SingleColumnValueExcludeFilter columnValueFilter = new SingleColumnValueExcludeFilter( + Constants.INFO_FAM_BYTES, Constants.RAW_COL_REPROCESS_BYTES, + CompareFilter.CompareOp.EQUAL, Bytes.toBytes(true)); + columnValueFilter.setFilterIfMissing(true); + filters.addFilter(columnValueFilter); + } else { + // Process each row only once. If it is already processed, then do not do + // it again. + SingleColumnValueExcludeFilter columnValueFilter = new SingleColumnValueExcludeFilter( + Constants.INFO_FAM_BYTES, Constants.JOB_PROCESSED_SUCCESS_COL_BYTES, + CompareFilter.CompareOp.NOT_EQUAL, Bytes.toBytes(true)); + filters.addFilter(columnValueFilter); + } + + /* + * Create a filter that passes only if both the jobconf and job history + * blobs are present. Here we use the last mod time columns as a proxy for + * their existence, since the byte comparison is much cheaper (requires an + * array copy). We do not want to process rows where we don't have both. 
+ */ + byte[] empty = Bytes.toBytes(0L); + FilterList bothColumnFilters = new FilterList( + FilterList.Operator.MUST_PASS_ALL); + SingleColumnValueFilter jobConfFilter = new SingleColumnValueFilter( + Constants.INFO_FAM_BYTES, Constants.JOBCONF_LAST_MODIFIED_COL_BYTES, + CompareFilter.CompareOp.GREATER, empty); + jobConfFilter.setFilterIfMissing(true); + bothColumnFilters.addFilter(jobConfFilter); + SingleColumnValueFilter jobHistoryFilter = new SingleColumnValueFilter( + Constants.INFO_FAM_BYTES, Constants.JOBHISTORY_LAST_MODIFIED_COL_BYTES, + CompareFilter.CompareOp.GREATER, empty); + jobHistoryFilter.setFilterIfMissing(true); + bothColumnFilters.addFilter(jobHistoryFilter); + + filters.addFilter(bothColumnFilters); + + scan.setFilter(filters); + + // Let's be nice; we are reading potentially large amounts of data that + // could take a bit to process. + scan.setCacheBlocks(false); + scan.setCaching(1); + + scan.setMaxVersions(1); + + return scan; + } + + /** + * Returns the raw job configuration stored for the given cluster and job ID + * @param jobId the cluster and job ID to look up + * @return the stored job configuration + * @throws IOException + */ + public Configuration getRawJobConfiguration(QualifiedJobId jobId) throws IOException { + Configuration conf = null; + byte[] rowKey = idConv.toBytes(jobId); + Get get = new Get(rowKey); + get.addColumn(Constants.RAW_FAM_BYTES, Constants.JOBCONF_COL_BYTES); + try { + Result result = rawTable.get(get); + if (result != null && !result.isEmpty()) { + conf = createConfigurationFromResult(result); + } + } catch (MissingColumnInResultException e) { + LOG.error("Failed to retrieve configuration from row returned for "+jobId, e); + } + return conf; + } + + /** + * Returns the raw job history file stored for the given cluster and job ID. + * @param jobId the cluster and job ID to look up + * @return the stored job history file contents or {@code null} if no corresponding record was found + * @throws IOException + */ + public String getRawJobHistory(QualifiedJobId jobId) throws IOException { + String historyData = null; + byte[] rowKey = idConv.toBytes(jobId); + Get get = new Get(rowKey); + get.addColumn(Constants.RAW_FAM_BYTES, Constants.JOBHISTORY_COL_BYTES); + Result result = rawTable.get(get); + if (result != null && !result.isEmpty()) { + historyData = Bytes.toString( + result.getValue(Constants.RAW_FAM_BYTES, Constants.JOBHISTORY_COL_BYTES)); + } + return historyData; + } + + /** + * @param result + * from the {@link Scan} from + * {@link #getHistoryRawTableScan(String, String, String, boolean, boolean, boolean)} + * @return the configuration part. + * @throws MissingColumnInResultException + * when the result does not contain {@link Constants#RAW_FAM}, + * {@link Constants#JOBCONF_COL}. 
+ */ + public Configuration createConfigurationFromResult(Result result) + throws MissingColumnInResultException { + + if (result == null) { + throw new IllegalArgumentException("Cannot create InputStream from null"); + } + + KeyValue keyValue = result.getColumnLatest(Constants.RAW_FAM_BYTES, + Constants.JOBCONF_COL_BYTES); + + // Create a jobConf from the raw input + Configuration jobConf = new Configuration(false); + + byte[] jobConfRawBytes = null; + if (keyValue != null) { + jobConfRawBytes = keyValue.getValue(); + } + if (jobConfRawBytes == null || jobConfRawBytes.length == 0) { + throw new MissingColumnInResultException(Constants.RAW_FAM_BYTES, + Constants.JOBCONF_COL_BYTES); + } + + InputStream in = new ByteArrayInputStream(jobConfRawBytes); + jobConf.addResource(in); + + // Configuration property loading is lazy, so we need to force a load from the input stream + try { + int size = jobConf.size(); + if (LOG.isDebugEnabled()) { + LOG.info("Loaded "+size+" job configuration properties from result"); + } + } catch (Exception e) { + throw new ProcessingException("Invalid configuration from result " + + Bytes.toStringBinary(result.getRow()), e); + } + + return jobConf; + } + + /** + * @param cluster + * the identifier for the Hadoop cluster on which a job ran + * @param jobId + * the identifier of the job as run on the JobTracker. + * @return the rowKey used in the JobHistory Raw table. + */ + public byte[] getRowKey(String cluster, String jobId) { + return idConv.toBytes(new QualifiedJobId(cluster, jobId)); + } + + /** + * Given a result from the {@link Scan} obtained by + * {@link #getHistoryRawTableScan(String, String, String, boolean, boolean, boolean)} + * . They for puts into the raw table are constructed using + * {@link #getRowKey(String, String)} + * + * @param result + * from which to pull the jobKey + * @return the qualified job identifier + * @throws RowKeyParseException + * if the rowkey cannot be parsed properly + */ + public QualifiedJobId getQualifiedJobIdFromResult(Result result) + throws RowKeyParseException { + + if (result == null) { + throw new RowKeyParseException( + "Cannot parse empty row key from result in HBase table: " + + Constants.HISTORY_RAW_TABLE); + } + return idConv.fromBytes(result.getRow()); + } + + /** + * @param result + * from the {@link Scan} from + * {@link #getHistoryRawTableScan(String, String, String, boolean, boolean, boolean)} + * this cannot be null; + * @return an inputStream from the JobHistory + * @throws MissingColumnInResultException + * when the result does not contain {@link Constants#RAW_FAM}, + * {@link Constants#JOBHISTORY_COL}. + */ + public InputStream getJobHistoryInputStreamFromResult(Result result) + throws MissingColumnInResultException { + + if (result == null) { + throw new IllegalArgumentException("Cannot create InputStream from null"); + } + + KeyValue keyValue = result.getColumnLatest(Constants.RAW_FAM_BYTES, + Constants.JOBHISTORY_COL_BYTES); + + byte[] jobHistoryRaw = null; + if (keyValue == null) { + throw new MissingColumnInResultException(Constants.RAW_FAM_BYTES, + Constants.JOBHISTORY_COL_BYTES); + } else { + jobHistoryRaw = keyValue.getValue(); + } + + InputStream is = new ByteArrayInputStream(jobHistoryRaw); + return is; + } + + /** + * Release internal HBase table instances. Must be called when consumer is + * done with this service. + * + * @throws IOException + * when bad things happen closing HBase table(s). 
+ */ + public void close() throws IOException { + if (rawTable != null) { + rawTable.close(); + } + } + + /** + * @param result + * from the {@link Scan} from + * {@link #getHistoryRawTableScan(String, String, String, boolean, boolean, boolean)} + * this cannot be null; + * @return the job submit time in milliseconds since January 1, 1970 UTC; or 0 + * if not value can be found + * @throws MissingColumnInResultException + * when the result does not contain {@link Constants#RAW_FAM}, + * {@link Constants#JOBHISTORY_COL}. + */ + public long getSubmitTimeMillisFromResult(Result result) + throws MissingColumnInResultException { + + if (result == null) { + throw new IllegalArgumentException("Cannot create InputStream from null"); + } + + KeyValue keyValue = result.getColumnLatest(Constants.RAW_FAM_BYTES, + Constants.JOBHISTORY_COL_BYTES); + + // Could be that there is no conf file (only a history file). + if (keyValue == null) { + throw new MissingColumnInResultException(Constants.RAW_FAM_BYTES, + Constants.JOBHISTORY_COL_BYTES); + } + + byte[] jobHistoryRaw = keyValue.getValue(); + + return getSubmitTimeMillisFromJobHistory(jobHistoryRaw); + + } + + /** + * Not for public use, package private for testing purposes. + * + * @param jobHistoryRaw + * from which to pull the SUBMIT_TIME + * @return the job submit time in milliseconds since January 1, 1970 UTC; or 0 + * if no value can be found. + * + */ + static long getSubmitTimeMillisFromJobHistory(byte[] jobHistoryRaw) { + + long submitTimeMillis = 0; + + // The start of the history file looks like this: + // Meta VERSION="1" . + // Job JOBID="job_20120101120000_12345" JOBNAME="..." + // USER="username" SUBMIT_TIME="1339063492288" JOBCONF=" + + // First we look for the first occurrence of SUBMIT_TIME=" + // Then we find the place of the next close quote " + // Then our value is in between those two if valid at all. + + if (null == jobHistoryRaw) { + return submitTimeMillis; + } + + int startIndex = ByteUtil.indexOf(jobHistoryRaw, + Constants.SUBMIT_TIME_PREFIX_BYTES, 0); + if (startIndex != -1) { + int prefixEndIndex = startIndex + + Constants.SUBMIT_TIME_PREFIX_BYTES.length; + + // Find close quote in the snippet, start looking where the prefix ends. + int secondQuoteIndex = ByteUtil.indexOf(jobHistoryRaw, + Constants.QUOTE_BYTES, prefixEndIndex); + if (secondQuoteIndex != -1) { + int numberLength = secondQuoteIndex - prefixEndIndex; + String submitTimeMillisString = Bytes.toString(jobHistoryRaw, + prefixEndIndex, numberLength); + try { + submitTimeMillis = Long.parseLong(submitTimeMillisString); + } catch (NumberFormatException nfe) { + submitTimeMillis = 0; + } + } + } + + return submitTimeMillis; + } + + /** + * @param row + * the identifier of the row in the RAW table. Cannot be null. + * @success whether the job was processed successfully or not + * @return a put to indicate that this job has been processed successfully. + */ + public Put getJobProcessedSuccessPut(byte[] row, boolean success) { + Put put = new Put(row); + put.add(Constants.INFO_FAM_BYTES, + Constants.JOB_PROCESSED_SUCCESS_COL_BYTES, Bytes.toBytes(success)); + if (success) { + // Make sure we mark that this row does not have to be reloaded, no matter + // if it is the first time, or it was marked with reload before. + put.add(Constants.INFO_FAM_BYTES, Constants.RAW_COL_REPROCESS_BYTES, + Bytes.toBytes(false)); + } + return put; + } + + /** + * @param row + * the identifier for the row in the RAW table. Cannot be null. 
+ * @param submitTimeMillis + * @return + */ + public Put getJobSubmitTimePut(byte[] row, long submitTimeMillis) { + Put put = new Put(row); + put.add(Constants.INFO_FAM_BYTES, Constants.SUBMIT_TIME_COL_BYTES, + Bytes.toBytes(submitTimeMillis)); + return put; + } + + /** + * Flags a job's RAW record for reprocessing + * + * @param jobId + */ + public void markJobForReprocesssing(QualifiedJobId jobId) throws IOException { + Put p = new Put(idConv.toBytes(jobId)); + p.add(Constants.INFO_FAM_BYTES, Constants.RAW_COL_REPROCESS_BYTES, + Bytes.toBytes(true)); + + rawTable.put(p); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryService.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryService.java new file mode 100644 index 0000000..19f5548 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/JobHistoryService.java @@ -0,0 +1,711 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import com.google.common.base.Stopwatch; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.filter.BinaryPrefixComparator; +import org.apache.hadoop.hbase.filter.CompareFilter; +import org.apache.hadoop.hbase.filter.Filter; +import org.apache.hadoop.hbase.filter.FilterList; +import org.apache.hadoop.hbase.filter.PrefixFilter; +import org.apache.hadoop.hbase.filter.QualifierFilter; +import org.apache.hadoop.hbase.filter.SingleColumnValueFilter; +import org.apache.hadoop.hbase.filter.WhileMatchFilter; +import org.apache.hadoop.hbase.util.Bytes; +import com.twitter.hraven.JobHistoryKeys; +import com.twitter.hraven.*; +import com.twitter.hraven.util.ByteUtil; + +/** + */ +public class JobHistoryService { + private static Log LOG = LogFactory.getLog(JobHistoryService.class); + + private final Configuration myConf; + private final HTable historyTable; + private final HTable taskTable; + private final JobHistoryByIdService idService; + private final JobKeyConverter jobKeyConv = new JobKeyConverter(); + private final TaskKeyConverter taskKeyConv = new TaskKeyConverter(); + + public JobHistoryService(Configuration myConf) throws IOException { + this.myConf = myConf; + this.historyTable = new HTable(myConf, Constants.HISTORY_TABLE_BYTES); + this.taskTable = new HTable(myConf, Constants.HISTORY_TASK_TABLE_BYTES); + this.idService = new JobHistoryByIdService(this.myConf); 
+ } + + /** + * Returns the most recent flow by application ID. This version will only + * populate job-level details not task information. To include task details + * use + * {@link JobHistoryService#getLatestFlow(String, String, String, boolean)}. + * + * @param cluster the cluster identifier + * @param user the user running the jobs + * @param appId the application description + * @return + */ + public Flow getLatestFlow(String cluster, String user, String appId) + throws IOException { + return getLatestFlow(cluster, user, appId, false); + } + + /** + * Returns the most recent flow by application ID. This version will populate + * both job-level for all jobs in the flow, and task-level data for each job. + * + * @param cluster the cluster identifier + * @param user the user running the jobs + * @param appId the application description + * @return + */ + public Flow getLatestFlow(String cluster, String user, String appId, + boolean populateTasks) throws IOException { + List flows = getFlowSeries(cluster, user, appId, null, populateTasks, + 1); + if (flows.size() > 0) { + return flows.get(0); + } + return null; + } + + /** + * Returns up to {@code limit} most recent flows by application ID. This + * version will only populate job-level details not task information. To + * include task details use + * {@link JobHistoryService#getFlowSeries(String, String, String, String, boolean, int)} + * . + * + * @param cluster the cluster identifier + * @param user the user running the jobs + * @param appId the application description + * @param limit the maximum number of Flow instances to return + * @return + */ + public List getFlowSeries(String cluster, String user, String appId, + int limit) throws IOException { + return getFlowSeries(cluster, user, appId, null, false, limit); + } + + /** + * Returns the {@link Flow} instance matching the application ID and run ID. + * + * @param cluster the cluster identifier + * @param user the user running the jobs + * @param appId the application description + * @param runId the specific run ID for the flow + * @param populateTasks whether or not to populate the task details for each job + * @return + */ + public Flow getFlow(String cluster, String user, String appId, long runId, boolean populateTasks) + throws IOException { + Flow flow = null; + + byte[] startRow = ByteUtil.join(Constants.SEP_BYTES, + Bytes.toBytes(cluster), Bytes.toBytes(user), Bytes.toBytes(appId), + Bytes.toBytes(FlowKey.encodeRunId(runId)), Constants.EMPTY_BYTES); + + LOG.info("Reading job_history rows start at " + Bytes.toStringBinary(startRow)); + Scan scan = new Scan(); + // start scanning history at cluster!user!app!run! + scan.setStartRow(startRow); + // require that all results match this flow prefix + scan.setFilter(new WhileMatchFilter(new PrefixFilter(startRow))); + + List flows = createFromResults(scan, populateTasks, 1); + if (flows.size() > 0) { + flow = flows.get(0); + } + + return flow; + } + + /** + * Returns the {@link Flow} instance containing the given job ID. 
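+ * The flow's {@link JobKey} is resolved through the {@link JobHistoryByIdService}
+ * index, then the flow's job rows are scanned. A brief sketch (the instance
+ * name, cluster, and job ID values are hypothetical):
+ * <pre>{@code
+ *   Flow flow = historyService.getFlowByJobID("cluster1",
+ *       "job_201206201718_0001", false);
+ * }</pre>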
+ * + * @param cluster the cluster identifier + * @param jobId the job identifier + * @return + */ + public Flow getFlowByJobID(String cluster, String jobId, boolean populateTasks) + throws IOException { + Flow flow = null; + JobKey key = idService.getJobKeyById(new QualifiedJobId(cluster, jobId)); + if (key != null) { + byte[] startRow = ByteUtil.join(Constants.SEP_BYTES, + Bytes.toBytes(key.getCluster()), Bytes.toBytes(key.getUserName()), + Bytes.toBytes(key.getAppId()), + Bytes.toBytes(key.getEncodedRunId()), Constants.EMPTY_BYTES); + + LOG.info("Reading job_history rows start at " + + Bytes.toStringBinary(startRow)); + Scan scan = new Scan(); + // start scanning history at cluster!user!app!run! + scan.setStartRow(startRow); + // require that all results match this flow prefix + scan.setFilter(new WhileMatchFilter(new PrefixFilter(startRow))); + + List flows = createFromResults(scan, populateTasks, 1); + if (flows.size() > 0) { + flow = flows.get(0); + } + } + return flow; + } + + /** + * Returns the most recent {@link Flow} runs, up to {@code limit} instances. + * If the {@code version} parameter is non-null, the returned results will be + * restricted to those matching this app version. + * + * @param cluster + * the cluster where the jobs were run + * @param user + * the user running the jobs + * @param appId + * the application identifier for the jobs + * @param version + * if non-null, only flows matching this application version will be + * returned + * @param populateTasks + * if {@code true}, then TaskDetails will be populated for each job + * @param limit + * the maximum number of flows to return + * @return + */ + public List getFlowSeries(String cluster, String user, String appId, + String version, boolean populateTasks, int limit) throws IOException { + // TODO: use RunMatchFilter to limit scan on the server side + byte[] rowPrefix = Bytes.toBytes(cluster + Constants.SEP + user + + Constants.SEP + appId + Constants.SEP); + Scan scan = new Scan(); + scan.setStartRow(rowPrefix); + // require that all rows match the prefix we're looking for + Filter prefixFilter = new WhileMatchFilter(new PrefixFilter(rowPrefix)); + // if version is passed, restrict the rows returned to that version + if (version != null && version.length() > 0) { + FilterList filters = new FilterList(FilterList.Operator.MUST_PASS_ALL); + filters.addFilter(prefixFilter); + filters.addFilter(new SingleColumnValueFilter(Constants.INFO_FAM_BYTES, + Constants.VERSION_COLUMN_BYTES, CompareFilter.CompareOp.EQUAL, Bytes + .toBytes(version))); + scan.setFilter(filters); + } else { + scan.setFilter(prefixFilter); + } + + return createFromResults(scan, populateTasks, limit); + } + + /** + * Returns the {@link Flow} runs' stats - summed up per flow + * If the {@code version} parameter is non-null, the returned results will be + * restricted to those matching this app version. + * + *

+ * Note: this retrieval method will omit the configuration data from + * all of the returned jobs. + *
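+ *
+ * <p>A usage sketch (hypothetical caller; {@code historyService} is an open
+ * instance of this service): fetch up to 100 of the most recent flow
+ * summaries for an app, any version, with no time bounds:
+ * <pre>{@code
+ *   List<Flow> flows = historyService.getFlowTimeSeriesStats(
+ *       "cluster1", "someuser", "someapp", null, 0L, 0L, 100, null);
+ * }</pre>
+ * Passing a non-null {@code startRow} resumes the scan from that row key,
+ * which lets callers page through long histories.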

+ * + * @param cluster + * the cluster where the jobs were run + * @param user + * the user running the jobs + * @param appId + * the application identifier for the jobs + * @param version + * if non-null, only flows matching this application version will be + * returned + * @param startTime + * the start time for the flows to be looked at + * @param endTime + * the end time for the flows to be looked at + * @param limit + * the maximum number of flows to return + * @return + */ + public List getFlowTimeSeriesStats(String cluster, String user, String appId, + String version, long startTime, long endTime, int limit, byte[] startRow) throws IOException { + + // app portion of row key + byte[] rowPrefix = Bytes.toBytes((cluster + Constants.SEP + user + Constants.SEP + + appId + Constants.SEP )); + byte[] scanStartRow; + + if (startRow != null ) { + scanStartRow = startRow; + } else { + if (endTime != 0) { + // use end time in start row, if present + long endRunId = FlowKey.encodeRunId(endTime); + scanStartRow = Bytes.add(rowPrefix, Bytes.toBytes(endRunId), Constants.SEP_BYTES); + } else { + scanStartRow = rowPrefix; + } + } + + // TODO: use RunMatchFilter to limit scan on the server side + Scan scan = new Scan(); + scan.setStartRow(scanStartRow); + FilterList filters = new FilterList(FilterList.Operator.MUST_PASS_ALL); + + if (startTime != 0) { + // if limited by start time, early out as soon as we hit it + long startRunId = FlowKey.encodeRunId(startTime); + // zero byte at the end makes the startRunId inclusive + byte[] scanEndRow = Bytes.add(rowPrefix, Bytes.toBytes(startRunId), + Constants.ZERO_SINGLE_BYTE); + scan.setStopRow(scanEndRow); + } else { + // require that all rows match the app prefix we're looking for + filters.addFilter( new WhileMatchFilter(new PrefixFilter(rowPrefix)) ); + } + + // if version is passed, restrict the rows returned to that version + if (version != null && version.length() > 0) { + filters.addFilter(new SingleColumnValueFilter(Constants.INFO_FAM_BYTES, + Constants.VERSION_COLUMN_BYTES, CompareFilter.CompareOp.EQUAL, Bytes + .toBytes(version))); + } + + // always ignore job configuration data + filters.addFilter( + new QualifierFilter(CompareFilter.CompareOp.NOT_EQUAL, + new BinaryPrefixComparator( + Bytes.add(Constants.JOB_CONF_COLUMN_PREFIX_BYTES, Constants.SEP_BYTES)))); + + scan.setFilter(filters); + + return createFromResults(scan, false, limit); + } + + /** + * Returns a specific job's data by job ID. This version does not populate + * the job's task data. 
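+ * A short sketch (hypothetical instance name and identifiers):
+ * <pre>{@code
+ *   JobDetails job = historyService.getJobByJobID("cluster1",
+ *       "job_201206201718_0001");
+ * }</pre>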
+ * @param cluster the cluster identifier + * @param cluster the job ID + */ + public JobDetails getJobByJobID(String cluster, String jobId) throws IOException { + return getJobByJobID(cluster, jobId, false); + } + + /** + * Returns a specific job's data by job ID + * @param cluster the cluster identifier + * @param cluster the job ID + * @param populateTasks if {@code true} populate the {@link TaskDetails} records for the job + */ + public JobDetails getJobByJobID(String cluster, String jobId, boolean populateTasks) + throws IOException { + return getJobByJobID(new QualifiedJobId(cluster, jobId), populateTasks); + } + + /** + * Returns a specific job's data by job ID + * @param jobId the fully qualified cluster + job identifier + * @param populateTasks if {@code true} populate the {@link TaskDetails} records for the job + */ + public JobDetails getJobByJobID(QualifiedJobId jobId, boolean populateTasks) + throws IOException { + JobDetails job = null; + JobKey key = idService.getJobKeyById(jobId); + if (key != null) { + byte[] historyKey = jobKeyConv.toBytes(key); + Result result = historyTable.get(new Get(historyKey)); + if (result != null && !result.isEmpty()) { + job = new JobDetails(key); + job.populate(result); + if (populateTasks) { + populateTasks(job); + } + } + } + return job; + } + + /** + * Returns a list of {@link Flow} instances generated from the given results. + * For the moment, this assumes that the given scanner provides results + * ordered first by flow ID. + * + * @param scan + * the Scan instance setup for retrieval + * @return + */ + private List createFromResults(Scan scan, boolean populateTasks, + int maxCount) throws IOException { + List flows = new ArrayList(); + ResultScanner scanner = null; + try { + Stopwatch timer = new Stopwatch().start(); + int rowCount = 0; + long colCount = 0; + scanner = historyTable.getScanner(scan); + Flow currentFlow = null; + for (Result result : scanner) { + if (result != null && !result.isEmpty()) { + rowCount++; + colCount += result.size(); + JobKey currentKey = jobKeyConv.fromBytes(result.getRow()); + // empty runId is special cased -- we need to treat each job as it's own flow + if (currentFlow == null || !currentFlow.contains(currentKey) || + currentKey.getRunId() == 0) { + // return if we've already hit the limit + if (flows.size() >= maxCount) { + break; + } + currentFlow = new Flow(new FlowKey(currentKey)); + flows.add(currentFlow); + } + JobDetails job = new JobDetails(currentKey); + job.populate(result); + currentFlow.addJob(job); + } + } + timer.stop(); + LOG.debug("Fetched "+rowCount+" rows, " + colCount + " columns in "+timer); + } finally { + if (scanner != null) { + scanner.close(); + } + } + + if (populateTasks) { + populateTasks(flows); + } + + return flows; + } + + /** + * Populate the task details for the jobs in the given flows. Note + * that all flows are expected to share the same cluster, user, and + * appId. 
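+ * Because each task row key is simply its job's row key followed by the task
+ * ID (see {@link TaskKeyConverter}), a single ordered scan starting at the
+ * first job's key visits every job's tasks consecutively, letting the jobs
+ * and the scanner be advanced together in one pass.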
+ * + * @param flows + */ + private void populateTasks(List flows) throws IOException { + if (flows == null || flows.size() == 0) { + return; + } + + // for simplicity, we assume that flows are ordered and consecutive + JobKey startJob = null; + // find the first job + for (Flow f : flows) { + List jobs = f.getJobs(); + if (jobs != null && jobs.size() > 0) { + startJob = jobs.get(0).getJobKey(); + break; + } + } + + if (startJob == null) { + LOG.info("No start job found for flows"); + return; + } + + byte[] startKey = Bytes.add(jobKeyConv.toBytes(startJob), Constants.SEP_BYTES); + Scan scan = new Scan(); + scan.setStartRow(startKey); + // expect a lot of tasks on average + scan.setCaching(500); + + ResultScanner scanner = this.taskTable.getScanner(scan); + try { + Result currentResult = scanner.next(); + for (Flow f : flows) { + for (JobDetails j : f.getJobs()) { + // within each job we advance through the scanner til we pass keys + // matching the current job + while (currentResult != null && !currentResult.isEmpty()) { + TaskKey taskKey = taskKeyConv.fromBytes(currentResult.getRow()); + // see if this task belongs to the current job + int comparison = j.getJobKey().compareTo(taskKey); + if (comparison < 0) { + // advance to next job (without advancing current result) + break; + } else if (comparison > 0) { + // advance tasks up to current job + } else { + // belongs to the current job + TaskDetails task = new TaskDetails(taskKey); + task.populate(currentResult + .getFamilyMap(Constants.INFO_FAM_BYTES)); + j.addTask(task); + } + currentResult = scanner.next(); + } + if (LOG.isDebugEnabled()) { + LOG.debug("Added " + j.getTasks().size() + " tasks to job " + + j.getJobKey().toString()); + } + } + } + } finally { + scanner.close(); + } + } + + /** + * Populate the task details for a specific job. To populate tasks for multiple + * jobs together, use {@link JobHistoryService#populateTasks(java.util.List)}. + * @param job + */ + private void populateTasks(JobDetails job) throws IOException { + // TODO: see if we can merge common logic here with populateTasks(List) + Scan scan = getTaskScan(job.getJobKey()); + ResultScanner scanner = this.taskTable.getScanner(scan); + try { + // advance through the scanner til we pass keys matching the job + for (Result currentResult : scanner) { + if (currentResult == null || currentResult.isEmpty()) { + break; + } + + TaskKey taskKey = taskKeyConv.fromBytes(currentResult.getRow()); + TaskDetails task = new TaskDetails(taskKey); + task.populate(currentResult + .getFamilyMap(Constants.INFO_FAM_BYTES)); + job.addTask(task); + } + if (LOG.isDebugEnabled()) { + LOG.debug("Added " + job.getTasks().size() + " tasks to job " + + job.getJobKey().toString()); + } + } finally { + scanner.close(); + } + } + + /** + * Returns a Scan instance to retrieve all the task rows for a given job + * from the job_history_task table. + * @param jobKey the job key to match for all task rows + * @return a {@code Scan} instance for the job_history_task table + */ + private Scan getTaskScan(JobKey jobKey) { + byte[] startKey = Bytes.add(jobKeyConv.toBytes(jobKey), Constants.SEP_BYTES); + Scan scan = new Scan(); + scan.setStartRow(startKey); + // only return tasks for this job + scan.setFilter(new WhileMatchFilter(new PrefixFilter(startKey))); + // expect a lot of tasks on average + scan.setCaching(500); + return scan; + } + + /** + * Converts serialized configuration properties back in to a Configuration + * object. 
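+ * For example (a sketch; {@code result} is assumed to be a row fetched from
+ * the job_history table):
+ * <pre>{@code
+ *   Configuration conf = JobHistoryService.parseConfiguration(
+ *       result.getFamilyMap(Constants.INFO_FAM_BYTES));
+ * }</pre>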
+ * + * @param keyValues + * @return + */ + public static Configuration parseConfiguration(Map keyValues) { + Configuration config = new Configuration(false); + byte[] configPrefix = Bytes.add(Constants.JOB_CONF_COLUMN_PREFIX_BYTES, + Constants.SEP_BYTES); + for (Map.Entry entry : keyValues.entrySet()) { + byte[] key = entry.getKey(); + if (Bytes.startsWith(key, configPrefix) + && key.length > configPrefix.length) { + byte[] name = Bytes.tail(key, key.length - configPrefix.length); + config.set(Bytes.toString(name), Bytes.toString(entry.getValue())); + } + } + + return config; + } + + /** + * Converts encoded key values back into counter objects. + * + * @param keyValues + * @return + */ + public static CounterMap parseCounters(byte[] prefix, + Map keyValues) { + CounterMap counterValues = new CounterMap(); + byte[] counterPrefix = Bytes.add(prefix, Constants.SEP_BYTES); + for (Map.Entry entry : keyValues.entrySet()) { + byte[] key = entry.getKey(); + if (Bytes.startsWith(key, counterPrefix) + && key.length > counterPrefix.length) { + // qualifier should be in the format: g!countergroup!counterkey + byte[][] qualifierFields = ByteUtil.split( + Bytes.tail(key, key.length - counterPrefix.length), + Constants.SEP_BYTES); + if (qualifierFields.length != 2) { + throw new IllegalArgumentException( + "Malformed column qualifier for counter value: " + + Bytes.toStringBinary(key)); + } + Counter c = new Counter(Bytes.toString(qualifierFields[0]), + Bytes.toString(qualifierFields[1]), Bytes.toLong(entry.getValue())); + counterValues.add(c); + } + } + + return counterValues; + } + + /** + * Returns the HBase {@code Put} instances to store for the given + * {@code Configuration} data. Each configuration property will be stored as a + * separate key value. + * + * @param jobDesc + * the {@link JobDesc} generated for the job + * @param jobConf + * the job configuration + * @return puts for the given job configuration + */ + public static List getHbasePuts(JobDesc jobDesc, Configuration jobConf) { + List puts = new LinkedList(); + + JobKey jobKey = new JobKey(jobDesc); + byte[] jobKeyBytes = new JobKeyConverter().toBytes(jobKey); + + // Add all columns to one put + Put jobPut = new Put(jobKeyBytes); + jobPut.add(Constants.INFO_FAM_BYTES, Constants.VERSION_COLUMN_BYTES, + Bytes.toBytes(jobDesc.getVersion())); + jobPut.add(Constants.INFO_FAM_BYTES, Constants.FRAMEWORK_COLUMN_BYTES, + Bytes.toBytes(jobDesc.getFramework().toString())); + + // Avoid doing string to byte conversion inside loop. + byte[] jobConfColumnPrefix = Bytes.toBytes(Constants.JOB_CONF_COLUMN_PREFIX + + Constants.SEP); + + // Create puts for all the parameters in the job configuration + Iterator> jobConfIterator = jobConf.iterator(); + while (jobConfIterator.hasNext()) { + Entry entry = jobConfIterator.next(); + // Prefix the job conf entry column with an indicator to + byte[] column = Bytes.add(jobConfColumnPrefix, + Bytes.toBytes(entry.getKey())); + jobPut.add(Constants.INFO_FAM_BYTES, column, + Bytes.toBytes(entry.getValue())); + } + + puts.add(jobPut); + + return puts; + } + + /** + * Removes the job's row from the job_history table, and all related task rows + * from the job_history_task table. + * @param key the job to be removed + * @return the number of rows deleted. 
+ * @throws IOException + */ + public int removeJob(JobKey key) throws IOException { + byte[] jobRow = jobKeyConv.toBytes(key); + + historyTable.delete(new Delete(jobRow)); + + int deleteCount = 1; + + // delete all task rows + Scan taskScan = getTaskScan(key); + // only need the row keys back to delete (all should have taskid) + taskScan.addColumn(Constants.INFO_FAM_BYTES, + JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TASKID)); + // no reason to cache rows we're deleting + taskScan.setCacheBlocks(false); + List taskDeletes = new ArrayList(); + ResultScanner scanner = taskTable.getScanner(taskScan); + try { + for (Result r : scanner) { + if (r != null && !r.isEmpty()) { + byte[] rowKey = r.getRow(); + TaskKey taskKey = taskKeyConv.fromBytes(rowKey); + if (!key.equals(taskKey)) { + LOG.warn("Found task not in the current job "+Bytes.toStringBinary(rowKey)); + break; + } + taskDeletes.add(new Delete(r.getRow())); + } + } + // Hang on the count because delete will modify our list. + deleteCount += taskDeletes.size(); + if (taskDeletes.size() > 0) { + LOG.info("Deleting "+taskDeletes.size()+" tasks for job "+key); + taskTable.delete(taskDeletes); + } + } finally { + scanner.close(); + } + return deleteCount; + } + + /** + * Cleans up the internal HBase table instances. This should always be called + * when the service instance is being released. + * + * @throws IOException + */ + public void close() throws IOException { + IOException caught = null; + if (this.historyTable != null) { + try { + this.historyTable.close(); + } catch (IOException ioe) { + caught = ioe; + } + } + if (this.idService != null) { + try { + this.idService.close(); + } catch (IOException ioe) { + // TODO: don't overwrite a previous exception + caught = ioe; + } + } + if (this.taskTable != null) { + try { + this.taskTable.close(); + } catch (IOException ioe) { + // TODO: don't overwrite a previous exception + caught = ioe; + } + } + if (caught != null) { + throw caught; + } + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/JobIdConverter.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/JobIdConverter.java new file mode 100644 index 0000000..1b3436d --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/JobIdConverter.java @@ -0,0 +1,42 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.datasource; + +import org.apache.hadoop.hbase.util.Bytes; + +import com.twitter.hraven.JobId; + +/** + */ +public class JobIdConverter implements ByteConverter { + @Override + public byte[] toBytes(JobId jobId) { + return Bytes.add(Bytes.toBytes(jobId.getJobEpoch()), + Bytes.toBytes(jobId.getJobSequence())); + } + + @Override + public JobId fromBytes(byte[] bytes) { + if (bytes == null || bytes.length < 16) { + return null; + } + + // expect a packed bytes encoding of [8 bytes epoch][8 bytes seq] + long epoch = Bytes.toLong(bytes, 0); + long seq = Bytes.toLong(bytes, 8); + return new JobId(epoch, seq); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/JobKeyConverter.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/JobKeyConverter.java new file mode 100644 index 0000000..241325c --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/JobKeyConverter.java @@ -0,0 +1,142 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import org.apache.hadoop.hbase.util.Bytes; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.JobId; +import com.twitter.hraven.JobKey; +import com.twitter.hraven.util.ByteUtil; + +/** + */ +public class JobKeyConverter implements ByteConverter { + private JobIdConverter idConv = new JobIdConverter(); + + /** + * Returns the byte encoded representation of a JobKey + * + * @param jobKey the JobKey to serialize + * @return the byte encoded representation of the JobKey + */ + @Override + public byte[] toBytes(JobKey jobKey) { + if (jobKey == null) { + return Constants.EMPTY_BYTES; + } else { + return ByteUtil.join(Constants.SEP_BYTES, + Bytes.toBytes(jobKey.getCluster()), + Bytes.toBytes(jobKey.getUserName()), + Bytes.toBytes(jobKey.getAppId()), + Bytes.toBytes(jobKey.getEncodedRunId()), + idConv.toBytes(jobKey.getJobId())); + } + } + + /** + * Reverse operation of + * {@link JobKeyConverter#toBytes(com.twitter.hraven.JobKey)} + * + * @param bytes the serialized version of a JobKey + * @return a deserialized JobKey instance + */ + @Override + public JobKey fromBytes(byte[] bytes) { + byte[][] splits = splitJobKey(bytes); + // no combined runId + jobId, parse as is + return parseJobKey(splits); + } + + /** + * Constructs a JobKey instance from the individual byte encoded key + * components. + * + * @param keyComponents + * as split on + * @return a JobKey instance containing the decoded components + */ + public JobKey parseJobKey(byte[][] keyComponents) { + // runId is inverted in the bytes representation so we get reverse + // chronological order + long encodedRunId = keyComponents.length > 3 ? + Bytes.toLong(keyComponents[3]) : Long.MAX_VALUE; + + JobId jobId = keyComponents.length > 4 ? + idConv.fromBytes(keyComponents[4]) : null; + + return new JobKey(Bytes.toString(keyComponents[0]), + (keyComponents.length > 1 ? Bytes.toString(keyComponents[1]) : null), + (keyComponents.length > 2 ? 
Bytes.toString(keyComponents[2]) : null), + Long.MAX_VALUE - encodedRunId, + jobId); + } + + /** + * Handles splitting the encoded job key correctly, accounting for long + * encoding of the run ID. Since the long encoding of the run ID may + * legitimately contain the separator bytes, we first split the leading 3 + * elements (cluster!user!appId), then split out the runId and remaining + * fields based on the encoded long length; + * + * @param rawKey byte encoded representation of the job key + * @return + */ + static byte[][] splitJobKey(byte[] rawKey) { + byte[][] splits = ByteUtil.split(rawKey, Constants.SEP_BYTES, 4); + + /* final components (runId!jobId!additional) need to be split separately for correct + * handling of runId long encoding */ + if (splits.length == 4) { + // TODO: this splitting is getting really ugly, look at using Orderly instead for keying + byte[] remainder = splits[3]; + byte[][] extraComponents = new byte[3][]; + + int offset = 0; + // run ID + extraComponents[0] = ByteUtil.safeCopy(remainder, offset, 8); + // followed by sep + job epoch + job seq + offset += 8+Constants.SEP_BYTES.length; + extraComponents[1] = ByteUtil.safeCopy(remainder, offset, 16); + offset += 16+Constants.SEP_BYTES.length; + // followed by any remainder + extraComponents[2] = ByteUtil.safeCopy(remainder, offset, remainder.length - offset); + + int extraSize = 0; + // figure out the full size of all splits + for (int i=0; i < extraComponents.length; i++) { + if (extraComponents[i] != null) { + extraSize++; + } else { + break; // first null signals hitting the end of remainder + } + } + + byte[][] allComponents = new byte[3+extraSize][]; + // fill in the first 3 elts + for (int i=0; i < 3; i++) { + allComponents[i] = splits[i]; + } + // add any extra that were non-null + for (int i=0; i < extraSize; i++) { + allComponents[3+i] = extraComponents[i]; + } + + return allComponents; + } + return splits; + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/MissingColumnInResultException.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/MissingColumnInResultException.java new file mode 100644 index 0000000..b6f45b5 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/MissingColumnInResultException.java @@ -0,0 +1,67 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Indicates that the {@link Result} from a {@link Scan} is missing an expected + * column. + *
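+ * A typical handling pattern (hypothetical caller; {@code conf} and
+ * {@code rawService} declared elsewhere) is to catch it and treat the row as
+ * having no stored value:
+ * <pre>{@code
+ *   try {
+ *     conf = rawService.createConfigurationFromResult(result);
+ *   } catch (MissingColumnInResultException e) {
+ *     LOG.error("No raw job configuration stored for "
+ *         + Bytes.toStringBinary(result.getRow()), e);
+ *   }
+ * }</pre>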

+ * Specifically, this exception indicates that the {@link KeyValue} returned by + * {@link Result#getColumnLatest(byte[], byte[])} is null or the + * list returned by {@link Result#getColumn(byte[], byte[]) is empty. + */ +public class MissingColumnInResultException extends Exception { + + private static final long serialVersionUID = 2561802650466866719L; + + + private final byte [] family; + private final byte [] qualifier; + + /** + * Constructs an exception indicating that the specified column + * @param family + * @param qualifier + */ + public MissingColumnInResultException(byte [] family, byte [] qualifier) { + super("Missing column: " + Bytes.toString(qualifier) + " from column family: " + + Bytes.toString(family)); + this.family = family; + this.qualifier = qualifier; + } + + /** + * @return the family for which a column was missing. + */ + public byte[] getFamily() { + return family; + } + + /** + * @return the qualifier indicating which column was missing. + */ + public byte[] getQualifier() { + return qualifier; + } + + + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/ProcessingException.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/ProcessingException.java new file mode 100644 index 0000000..0387001 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/ProcessingException.java @@ -0,0 +1,31 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +/** + */ +public class ProcessingException extends RuntimeException { + + private static final long serialVersionUID = -5606549071630261979L; + + public ProcessingException(String message) { + super(message); + } + + public ProcessingException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/QualifiedJobIdConverter.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/QualifiedJobIdConverter.java new file mode 100644 index 0000000..97ed6e8 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/QualifiedJobIdConverter.java @@ -0,0 +1,47 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.datasource; + +import org.apache.hadoop.hbase.util.Bytes; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.JobId; +import com.twitter.hraven.QualifiedJobId; +import com.twitter.hraven.util.ByteUtil; + +/** + */ +public class QualifiedJobIdConverter implements ByteConverter { + JobIdConverter jobIdConv = new JobIdConverter(); + + @Override + public byte[] toBytes(QualifiedJobId id) { + return ByteUtil.join(Constants.SEP_BYTES, + Bytes.toBytes(id.getCluster()), + jobIdConv.toBytes(id)); + } + + @Override + public QualifiedJobId fromBytes(byte[] bytes) { + byte[][] parts = ByteUtil.split(bytes, Constants.SEP_BYTES, 2); + if (parts.length != 2) { + throw new IllegalArgumentException("Invalid encoded ID, must be 2 parts"); + } + String cluster = Bytes.toString(parts[0]); + JobId jobId = jobIdConv.fromBytes(parts[1]); + return new QualifiedJobId(cluster, jobId); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/RowKeyParseException.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/RowKeyParseException.java new file mode 100644 index 0000000..4d2a2c3 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/RowKeyParseException.java @@ -0,0 +1,39 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +/** + * This exception indicates that a row key could not be parsed successfully. + */ +public class RowKeyParseException extends Exception { + + private static final long serialVersionUID = 839389516279735249L; + + /** + * @param message + */ + public RowKeyParseException(String message) { + super(message); + } + + /** + * @param message + * @param cause + */ + public RowKeyParseException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/RunMatchFilter.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/RunMatchFilter.java new file mode 100644 index 0000000..537a1d5 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/RunMatchFilter.java @@ -0,0 +1,112 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.datasource; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.hbase.filter.FilterBase; +import org.apache.hadoop.hbase.util.Bytes; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.util.ByteUtil; + +/** + * Match up to N runs of a given app. Once N runs have been seen, we filter all + * remaining rows. + */ +public class RunMatchFilter extends FilterBase { + private byte[] appId; + private int maxCount; + private byte[] lastRunId = null; + private int seenCount; + + /** + * Match only a single run of the given appId + * @param appId + */ + public RunMatchFilter(String appId) { + this(appId, 1); + } + + /** + * Match up to maxCount runs of the given appId + * + * @param appId + * @param maxCount + */ + public RunMatchFilter(String appId, int maxCount) { + this.appId = Bytes.toBytes(appId); + this.maxCount = maxCount; + } + + @Override + public void reset() { + this.seenCount = 0; + } + + @Override + public boolean filterRowKey(byte[] buffer, int offset, int length) { + // TODO: don't copy the byte[] + byte[] rowkey = new byte[length]; + System.arraycopy(buffer, offset, rowkey, 0, length); + List splits = ByteUtil.splitRanges(rowkey, Constants.SEP_BYTES); + if (splits.size() < 4) { + // invalid row key + return true; + } + ByteUtil.Range appRange = splits.get(1); + int appCompare = Bytes.compareTo(appId, 0, appId.length, + rowkey, appRange.start(), appRange.length()); + if (appCompare != 0) { + return false; + } + ByteUtil.Range runRange = splits.get(2); + int runLength = runRange.length(); + if (lastRunId == null || + Bytes.compareTo(lastRunId, 0, lastRunId.length, + rowkey, runRange.start(), runLength) != 0) { + lastRunId = new byte[runLength]; + System.arraycopy(rowkey, runRange.start(), lastRunId, 0, runLength); + seenCount++; + } + + return seenCount > maxCount; + } + + @Override + public boolean filterAllRemaining() { + // once we've seen the limit number of runs, skip everything else + return seenCount > maxCount; + } + + @Override + public void write(DataOutput out) throws IOException { + out.writeInt(appId.length); + out.write(appId); + out.writeInt(maxCount); + } + + @Override + public void readFields(DataInput in) throws IOException { + int appIdLength = in.readInt(); + this.appId = new byte[appIdLength]; + in.readFully(appId); + this.maxCount = in.readInt(); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/TaskKeyConverter.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/TaskKeyConverter.java new file mode 100644 index 0000000..1e1a398 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/TaskKeyConverter.java @@ -0,0 +1,55 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.datasource; + +import org.apache.hadoop.hbase.util.Bytes; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.JobKey; +import com.twitter.hraven.TaskKey; + +/** + */ +public class TaskKeyConverter implements ByteConverter { + private JobKeyConverter jobKeyConv = new JobKeyConverter(); + + /** + * Returns the bytes representation for a TaskKey. + * + * @param taskKey + * the TaskKey instance to serialize + * @return the serialized representation of the TaskKey + */ + @Override + public byte[] toBytes(TaskKey taskKey) { + return Bytes.add(jobKeyConv.toBytes(taskKey), Constants.SEP_BYTES, + Bytes.toBytes(taskKey.getTaskId())); + } + + /** + * Generates a TaskKey from the byte encoded format. + * + * @param bytes the serialized version of a task key + * @return the deserialized TaskKey instance + */ + @Override + public TaskKey fromBytes(byte[] bytes) { + byte[][] keyComponents = JobKeyConverter.splitJobKey(bytes); + JobKey jobKey = jobKeyConv.parseJobKey(keyComponents); + return new TaskKey(jobKey, + (keyComponents.length > 5 ? Bytes.toString(keyComponents[5]) : null)); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/datasource/VersionInfo.java b/hraven-core/src/main/java/com/twitter/hraven/datasource/VersionInfo.java new file mode 100644 index 0000000..5d996c8 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/datasource/VersionInfo.java @@ -0,0 +1,84 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.hraven.datasource; + +import org.apache.commons.lang.builder.HashCodeBuilder; + +public class VersionInfo implements Comparable { + + private String version; + private long timestamp; + + public VersionInfo(String v, long ts) { + this.version = v; + this.timestamp = ts; + } + + public String getVersion() { + return version; + } + + public void setVersion(String version) { + this.version = version; + } + + public long getTimestamp() { + return timestamp; + } + + public void setTimestamp(long timestamp) { + this.timestamp = timestamp; + } + + /** + * Compares two VersionInfo timestamps to order them in reverse chronological + * order + * + * @param other + * @return 0 if timestamps are equal, 1 if this timestamp less than other + * timestamp, -1 if this timestamp is greater than other timestamp + * + */ + @Override + public int compareTo(VersionInfo other) { + if (this.timestamp == other.timestamp) { + return 0; + } + if (this.timestamp < other.timestamp) { + return 1; + } + return -1; + } + + @Override + public boolean equals(Object other) { + if (other instanceof VersionInfo) { + VersionInfo otherVersionInfo = (VersionInfo) other; + return (this.timestamp == otherVersionInfo.timestamp) + && (this.version.equals(otherVersionInfo.version)); + } + return false; + } + + @Override + public int hashCode(){ + return new HashCodeBuilder() + .append(this.timestamp) + .append(this.version) + .toHashCode(); + } + +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/rest/ObjectMapperProvider.java b/hraven-core/src/main/java/com/twitter/hraven/rest/ObjectMapperProvider.java new file mode 100644 index 0000000..e7e77f4 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/rest/ObjectMapperProvider.java @@ -0,0 +1,199 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.rest; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Map; + +import javax.ws.rs.ext.ContextResolver; +import javax.ws.rs.ext.Provider; + +import org.apache.hadoop.conf.Configuration; +import org.codehaus.jackson.JsonGenerator; +import org.codehaus.jackson.Version; +import org.codehaus.jackson.map.JsonSerializer; +import org.codehaus.jackson.map.ObjectMapper; +import org.codehaus.jackson.map.SerializationConfig.Feature; +import org.codehaus.jackson.map.SerializerProvider; +import org.codehaus.jackson.map.module.SimpleModule; + +import com.google.common.base.Predicate; +import com.twitter.hraven.Counter; +import com.twitter.hraven.CounterMap; +import com.twitter.hraven.Flow; + +/** + * Class that provides custom JSON bindings (where needed) for out object model. 
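+ *
+ * <p>Outside the REST layer, the same bindings can be reused directly
+ * (a sketch; exception handling omitted and {@code someFlow} is any
+ * {@link Flow} instance):
+ * <pre>{@code
+ *   ObjectMapper mapper = ObjectMapperProvider.createCustomMapper();
+ *   String json = mapper.writeValueAsString(someFlow);
+ * }</pre>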
+ */ +@Provider +public class ObjectMapperProvider implements ContextResolver { + private final ObjectMapper customMapper; + + public ObjectMapperProvider() { + customMapper = createCustomMapper(); + } + + @Override + public ObjectMapper getContext(Class type) { + return customMapper; + } + + public static ObjectMapper createCustomMapper() { + ObjectMapper result = new ObjectMapper(); + result.configure(Feature.INDENT_OUTPUT, true); + SimpleModule module = new SimpleModule("hRavenModule", new Version(0, 4, 0, null)); + addJobMappings(module); + module.addSerializer(Flow.class, new FlowSerializer()); + result.registerModule(module); + return result; + } + + private static SimpleModule addJobMappings(SimpleModule module) { + module.addSerializer(Configuration.class, new ConfigurationSerializer()); + module.addSerializer(CounterMap.class, new CounterSerializer()); + return module; + } + + /** + * Custom serializer for Configuration object. We don't want to serialize the classLoader. + */ + public static class ConfigurationSerializer extends JsonSerializer { + + @Override + public void serialize(Configuration conf, JsonGenerator jsonGenerator, + SerializerProvider serializerProvider) throws IOException { + SerializationContext context = RestJSONResource.serializationContext.get(); + Predicate configFilter = context.getConfigurationFilter(); + Iterator> keyValueIterator = conf.iterator(); + + jsonGenerator.writeStartObject(); + + // here's where we can filter out keys if we want + while (keyValueIterator.hasNext()) { + Map.Entry kvp = keyValueIterator.next(); + if (configFilter == null || configFilter.apply(kvp.getKey())) { + jsonGenerator.writeFieldName(kvp.getKey()); + jsonGenerator.writeString(kvp.getValue()); + } + } + jsonGenerator.writeEndObject(); + } + } + + /** + * Custom serializer for Configuration object. We don't want to serialize the classLoader. + */ + public static class CounterSerializer extends JsonSerializer { + + @Override + public void serialize(CounterMap counterMap, JsonGenerator jsonGenerator, + SerializerProvider serializerProvider) throws IOException { + + jsonGenerator.writeStartObject(); + for (String group : counterMap.getGroups()) { + jsonGenerator.writeFieldName(group); + + jsonGenerator.writeStartObject(); + Map groupMap = counterMap.getGroup(group); + for (String counterName : groupMap.keySet()) { + Counter counter = groupMap.get(counterName); + jsonGenerator.writeFieldName(counter.getKey()); + jsonGenerator.writeNumber(counter.getValue()); + } + jsonGenerator.writeEndObject(); + } + jsonGenerator.writeEndObject(); + } + } + + /** + * Custom serializer for Flow object. We don't want to serialize the + * classLoader. 
based on the parameters passed by caller, we determine which + * fields to include in serialized response + */ + public static class FlowSerializer extends JsonSerializer { + @Override + public void serialize(Flow aFlow, JsonGenerator jsonGenerator, + SerializerProvider serializerProvider) throws IOException { + SerializationContext.DetailLevel selectedSerialization = + RestJSONResource.serializationContext.get().getLevel(); + if (selectedSerialization == SerializationContext.DetailLevel.EVERYTHING) { + // should generate the json for everything in the flow object + ObjectMapper om = new ObjectMapper(); + om.registerModule( + addJobMappings(new SimpleModule("hRavenModule", new Version(0, 4, 0, null)))); + om.writeValue(jsonGenerator, aFlow); + } else { + jsonGenerator.writeStartObject(); + if (selectedSerialization == SerializationContext.DetailLevel.FLOW_SUMMARY_STATS_ONLY + || selectedSerialization == SerializationContext.DetailLevel.FLOW_SUMMARY_STATS_WITH_JOB_STATS) { + // serialize the FlowKey object + jsonGenerator.writeFieldName("flowKey"); + jsonGenerator.writeObject(aFlow.getFlowKey()); + // serialize individual members of this class + jsonGenerator.writeFieldName("flowName"); + jsonGenerator.writeString(aFlow.getFlowName()); + jsonGenerator.writeFieldName("userName"); + jsonGenerator.writeString(aFlow.getUserName()); + jsonGenerator.writeFieldName("progress"); + jsonGenerator.writeNumber(aFlow.getProgress()); + jsonGenerator.writeFieldName("jobCount"); + jsonGenerator.writeNumber(aFlow.getJobCount()); + jsonGenerator.writeFieldName("totalMaps"); + jsonGenerator.writeNumber(aFlow.getTotalMaps()); + jsonGenerator.writeFieldName("totalReduces"); + jsonGenerator.writeNumber(aFlow.getTotalReduces()); + jsonGenerator.writeFieldName("mapFilesBytesRead"); + jsonGenerator.writeNumber(aFlow.getMapFileBytesRead()); + jsonGenerator.writeFieldName("mapFilesBytesWritten"); + jsonGenerator.writeNumber(aFlow.getMapFileBytesWritten()); + jsonGenerator.writeFieldName("reduceFilesBytesRead"); + jsonGenerator.writeNumber(aFlow.getReduceFileBytesRead()); + jsonGenerator.writeFieldName("hdfsBytesRead"); + jsonGenerator.writeNumber(aFlow.getHdfsBytesRead()); + jsonGenerator.writeFieldName("hdfsBytesWritten"); + jsonGenerator.writeNumber(aFlow.getHdfsBytesWritten()); + jsonGenerator.writeFieldName("mapSlotMillis"); + jsonGenerator.writeNumber(aFlow.getMapSlotMillis()); + jsonGenerator.writeFieldName("reduceSlotMillis"); + jsonGenerator.writeNumber(aFlow.getReduceSlotMillis()); + jsonGenerator.writeFieldName("reduceShuffleBytes"); + jsonGenerator.writeNumber(aFlow.getReduceShuffleBytes()); + jsonGenerator.writeFieldName("duration"); + jsonGenerator.writeNumber(aFlow.getDuration()); + jsonGenerator.writeFieldName("cluster"); + jsonGenerator.writeString(aFlow.getCluster()); + jsonGenerator.writeFieldName("appId"); + jsonGenerator.writeString(aFlow.getAppId()); + jsonGenerator.writeFieldName("runId"); + jsonGenerator.writeNumber(aFlow.getRunId()); + jsonGenerator.writeFieldName("version"); + jsonGenerator.writeString(aFlow.getVersion()); + + // if flag, include job details + if (selectedSerialization == + SerializationContext.DetailLevel.FLOW_SUMMARY_STATS_WITH_JOB_STATS) { + jsonGenerator.writeFieldName("jobs"); + jsonGenerator.writeObject(aFlow.getJobs()); + } + } + jsonGenerator.writeEndObject(); + } + // reset the serializationContext variable back to an initialValue + } + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/rest/PaginatedResult.java 
b/hraven-core/src/main/java/com/twitter/hraven/rest/PaginatedResult.java new file mode 100644 index 0000000..37bfe76 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/rest/PaginatedResult.java @@ -0,0 +1,102 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.hraven.rest; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Container class that maintains a set of results that can be used for + * retrieving results in a paginated fashion + */ + +public class PaginatedResult { + + // the start row for the next page of results + // if no more results are remaining, this will be null + private byte[] nextStartRow; + + // the number of results to be returned per call + private int limit; + + // request parameters & values + private Map requestParameters; + + // actual values that are to be returned + private List values; + + // basic constructor + public PaginatedResult() { + values = new ArrayList(); + requestParameters = new HashMap(); + // set the next start row to null + // this helps the UI to know that there is no next page + this.setNextStartRow(null); + limit = 0; + } + + // constructor with limit + public PaginatedResult(int limit) { + values = new ArrayList(); + requestParameters = new HashMap(); + this.limit = limit; + // set the next start row to null + // this helps the UI to know that there is no next page + this.setNextStartRow(null); + } + + public List getValues() { + return values; + } + + public void setValues(List inputValues) { + this.values = inputValues; + } + + public void addValue(T value) { + this.values.add(value); + } + + public byte[] getNextStartRow() { + return nextStartRow; + } + + public void setNextStartRow(byte[] nextStartRow) { + this.nextStartRow = nextStartRow; + } + + public int getLimit() { + return limit; + } + + public void setLimit(int limit) { + this.limit = limit; + } + + public Map getRequestParameters() { + return requestParameters; + } + + public void setRequestParameters(Map requestParameters) { + this.requestParameters = requestParameters; + } + + public void addRequestParameter(String param, String value) { + this.requestParameters.put(param, value); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/rest/RestJSONResource.java b/hraven-core/src/main/java/com/twitter/hraven/rest/RestJSONResource.java new file mode 100644 index 0000000..c868b44 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/rest/RestJSONResource.java @@ -0,0 +1,275 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.rest; + +import java.io.IOException; +import java.util.List; + +import javax.ws.rs.DefaultValue; +import javax.ws.rs.GET; +import javax.ws.rs.Path; +import javax.ws.rs.PathParam; +import javax.ws.rs.Produces; +import javax.ws.rs.QueryParam; +import javax.ws.rs.core.MediaType; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseConfiguration; + +import com.google.common.base.Predicate; +import com.sun.jersey.core.util.Base64; +import com.twitter.hraven.Flow; +import com.twitter.hraven.JobDetails; +import com.twitter.hraven.datasource.AppVersionService; +import com.twitter.hraven.datasource.FlowKeyConverter; +import com.twitter.hraven.datasource.JobHistoryService; +import com.twitter.hraven.datasource.VersionInfo; + +/** + * Main REST resource that handles binding the REST API to the JobHistoryService. + * + * TODO: better prevalidation + * TODO: handle null results with empty json object or response code + */ +@Path("/api/v1/") +public class RestJSONResource { + private static final Log LOG = LogFactory.getLog(RestJSONResource.class); + + private static final Configuration HBASE_CONF = HBaseConfiguration.create(); + private static final ThreadLocal serviceThreadLocal = + new ThreadLocal() { + + @Override + protected JobHistoryService initialValue() { + try { + LOG.info("Initializing JobHistoryService"); + return new JobHistoryService(HBASE_CONF); + } catch (IOException e) { + throw new RuntimeException("Could not initialize JobHistoryService", e); + } + } + }; + + private static final ThreadLocal serviceThreadLocalAppVersion = + new ThreadLocal() { + + @Override + protected AppVersionService initialValue() { + try { + LOG.info("Initializing AppVersionService"); + return new AppVersionService(HBASE_CONF); + } catch (IOException e) { + throw new RuntimeException("Could not initialize AppVersionService", e); + } + } + }; + + public static final ThreadLocal serializationContext = + new ThreadLocal() { + @Override + protected SerializationContext initialValue() { + // by default all retrieved data is serialized, overrideable per endpoint + return new SerializationContext(SerializationContext.DetailLevel.EVERYTHING); + } + }; + + @GET + @Path("job/{cluster}/{jobId}") + @Produces(MediaType.APPLICATION_JSON) + public JobDetails getJobById(@PathParam("cluster") String cluster, + @PathParam("jobId") String jobId) throws IOException { + LOG.info("Fetching JobDetails for jobId=" + jobId); + serializationContext.set(new SerializationContext( + SerializationContext.DetailLevel.EVERYTHING)); + JobDetails jobDetails = getJobHistoryService().getJobByJobID(cluster, jobId); + return jobDetails; + } + + @GET + @Path("jobFlow/{cluster}/{jobId}") + @Produces(MediaType.APPLICATION_JSON) + public Flow getJobFlowById(@PathParam("cluster") String cluster, + @PathParam("jobId") String jobId) throws IOException { + LOG.info(String.format("Fetching Flow for cluster=%s, jobId=%s", cluster, jobId)); + serializationContext.set(new SerializationContext( + SerializationContext.DetailLevel.EVERYTHING)); + Flow flow = getJobHistoryService().getFlowByJobID(cluster, jobId, false); + return flow; + } + + @GET + @Path("flow/{cluster}/{user}/{appId}/{version}") + @Produces(MediaType.APPLICATION_JSON) + public List 
getJobFlowById(@PathParam("cluster") String cluster, + @PathParam("user") String user, + @PathParam("appId") String appId, + @PathParam("version") String version, + @QueryParam("limit") int limit, + @QueryParam("includeConf") List includeConfig, + @QueryParam("includeConfRegex") List includeConfigRegex) + throws IOException { + Predicate configFilter = null; + if (includeConfig != null && !includeConfig.isEmpty()) { + configFilter = new SerializationContext.ConfigurationFilter(includeConfig); + } else if (includeConfigRegex != null && !includeConfigRegex.isEmpty()) { + configFilter = new SerializationContext.RegexConfigurationFilter(includeConfigRegex); + } + serializationContext.set(new SerializationContext( + SerializationContext.DetailLevel.EVERYTHING, configFilter)); + return getFlowList(cluster, user, appId, version, limit); + } + + @GET + @Path("flow/{cluster}/{user}/{appId}") + @Produces(MediaType.APPLICATION_JSON) + public List getJobFlowById(@PathParam("cluster") String cluster, + @PathParam("user") String user, + @PathParam("appId") String appId, + @QueryParam("limit") int limit, + @QueryParam("includeConf") List includeConfig, + @QueryParam("includeConfRegex") List includeConfigRegex) + throws IOException { + Predicate configFilter = null; + if (includeConfig != null && !includeConfig.isEmpty()) { + configFilter = new SerializationContext.ConfigurationFilter(includeConfig); + } else if (includeConfigRegex != null && !includeConfigRegex.isEmpty()) { + configFilter = new SerializationContext.RegexConfigurationFilter(includeConfigRegex); + } + serializationContext.set(new SerializationContext( + SerializationContext.DetailLevel.EVERYTHING, configFilter)); + return getFlowList(cluster, user, appId, null, limit); + } + + @GET + @Path("flowStats/{cluster}/{user}/{appId}") + @Produces(MediaType.APPLICATION_JSON) + public PaginatedResult getJobFlowStats(@PathParam("cluster") String cluster, + @PathParam("user") String user, + @PathParam("appId") String appId, + @QueryParam("version") String version, + @QueryParam("startRow") String startRowParam, + @QueryParam("startTime") long startTime, + @QueryParam("endTime") long endTime, + @QueryParam("limit") @DefaultValue("100") int limit, + @QueryParam("includeJobs") boolean includeJobs + ) throws IOException { + + byte[] startRow = null; + if (startRowParam != null) { + startRow = Base64.decode(startRowParam); + } + + if (includeJobs) { + serializationContext.set(new SerializationContext( + SerializationContext.DetailLevel.FLOW_SUMMARY_STATS_WITH_JOB_STATS)); + } else { + serializationContext.set(new SerializationContext( + SerializationContext.DetailLevel.FLOW_SUMMARY_STATS_ONLY)); + } + + if(endTime == 0) { + endTime = Long.MAX_VALUE; + } + + if( (limit == 0) || (limit == Integer.MAX_VALUE)) { + limit = Integer.MAX_VALUE - 1; + } + + List flows = getJobHistoryService().getFlowTimeSeriesStats(cluster, user, + appId, version, startTime, endTime, limit + 1, startRow); + PaginatedResult flowStatsPage = new PaginatedResult(limit); + // add request parameters + flowStatsPage.addRequestParameter("user", user); + flowStatsPage.addRequestParameter("appId", appId); + if ( StringUtils.isNotBlank(version)){ + flowStatsPage.addRequestParameter("version", version); + } else { + flowStatsPage.addRequestParameter("version", "all"); + } + + flowStatsPage.addRequestParameter("startTime", Long.toString(startTime)); + flowStatsPage.addRequestParameter("endTime", Long.toString(endTime)); + flowStatsPage.addRequestParameter("limit", Integer.toString(limit)); + + 
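Illustrative sketch, not part of this patch: the shape of a request against the versioned flow endpoint above. The host, cluster, user, appId and version values are placeholders; limit and the repeated includeConf parameters map to the @QueryParam bindings of the resource method, and the listed keys end up in a ConfigurationFilter.

public class FlowRequestUrlSketch {
  public static void main(String[] args) {
    String url = String.format(
        "http://%s/api/v1/flow/%s/%s/%s/%s?limit=%d&includeConf=%s&includeConf=%s",
        "hraven.example.com:8080",   // REST server address (placeholder)
        "cluster1@identifier1",      // cluster
        "edgar",                     // user
        "wordcount",                 // appId
        "abc123",                    // version (placeholder)
        5,                           // limit
        "pig.alias",                 // configuration keys to retain via ConfigurationFilter
        "pig.job.feature");
    System.out.println(url);
  }
}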
if ( startRow != null ){ + flowStatsPage.addRequestParameter("startRow", startRowParam); + } + + if ( includeJobs) { + flowStatsPage.addRequestParameter("includeJobs", "true"); + } else { + flowStatsPage.addRequestParameter("includeJobs", "false"); + } + + if (flows.size() > limit) { + // copy over the last excluding the last element + // the last element is the start row for next page + flowStatsPage.setValues(flows.subList(0, limit)); + flowStatsPage.setNextStartRow(new FlowKeyConverter().toBytes(flows.get(limit).getFlowKey())); + } else { + flowStatsPage.setNextStartRow(null); + flowStatsPage.setValues(flows); + } + return flowStatsPage; + } + + @GET + @Path("appVersion/{cluster}/{user}/{appId}/") + @Produces(MediaType.APPLICATION_JSON) + public List getDistinctVersions(@PathParam("cluster") String cluster, + @PathParam("user") String user, + @PathParam("appId") String appId, + @QueryParam("limit") int limit) throws IOException { + if (LOG.isTraceEnabled()) { + LOG.trace("Fetching App Versions for cluster=" + cluster + " user=" + user + " app=" + appId); + } + serializationContext.set(new SerializationContext( + SerializationContext.DetailLevel.EVERYTHING)); + List distinctVersions = serviceThreadLocalAppVersion.get() + .getDistinctVersions( + StringUtils.trimToEmpty(cluster), + StringUtils.trimToEmpty(user), + StringUtils.trimToEmpty(appId)); + return distinctVersions; + } + + private List getFlowList(String cluster, + String user, + String appId, + String version, + int limit) throws IOException { + if (limit < 1) { limit = 1; } + LOG.info(String.format( + "Fetching Flow series for cluster=%s, user=%s, appId=%s, version=%s, limit=%s", + cluster, user, appId, version, limit)); + + List flows = + getJobHistoryService().getFlowSeries(cluster, user, appId, version, false, limit); + LOG.info(String.format("Found %s flows", flows.size())); + return flows; + } + + private static JobHistoryService getJobHistoryService() throws IOException { + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("Returning JobHistoryService %s bound to thread %s", + serviceThreadLocal.get(), Thread.currentThread().getName())); + } + return serviceThreadLocal.get(); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/rest/RestServer.java b/hraven-core/src/main/java/com/twitter/hraven/rest/RestServer.java new file mode 100644 index 0000000..f46b4fc --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/rest/RestServer.java @@ -0,0 +1,131 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
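Minimal sketch, not part of this patch, of the "limit + 1" pattern used by getJobFlowStats above: fetch one extra row, and if it comes back, keep only the first limit rows and use the extra row's key as the start row of the next page. PaginatedResult is assumed to be generic, and KeyExtractor is a hypothetical stand-in for FlowKeyConverter().toBytes(flow.getFlowKey()).

import java.util.List;

import com.twitter.hraven.rest.PaginatedResult;

public class LimitPlusOneSketch {
  /** Hypothetical key extractor standing in for the flow-key conversion above. */
  interface KeyExtractor<T> {
    byte[] keyOf(T value);
  }

  static <T> PaginatedResult<T> toPage(List<T> fetched, int limit, KeyExtractor<T> keys) {
    PaginatedResult<T> page = new PaginatedResult<T>(limit);
    if (fetched.size() > limit) {
      page.setValues(fetched.subList(0, limit));             // drop the sentinel row from the page
      page.setNextStartRow(keys.keyOf(fetched.get(limit)));  // sentinel row keys the next page
    } else {
      page.setValues(fetched);
      page.setNextStartRow(null);                            // no further pages
    }
    return page;
  }
}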
+*/ +package com.twitter.hraven.rest; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.mortbay.jetty.Connector; +import org.mortbay.jetty.Server; +import org.mortbay.jetty.nio.SelectChannelConnector; +import org.mortbay.jetty.servlet.Context; +import org.mortbay.jetty.servlet.ServletHolder; +import org.mortbay.thread.QueuedThreadPool; + +import com.google.common.util.concurrent.AbstractIdleService; +import com.sun.jersey.api.json.JSONConfiguration; +import com.sun.jersey.spi.container.servlet.ServletContainer; + +/** + * Simple REST server that spawns an embedded Jetty instance to service requests + */ +public class RestServer extends AbstractIdleService { + /** Default TCP port for the server to listen on */ + public static final int DEFAULT_PORT = 8080; + /** Default IP address for the server to listen on */ + public static final String DEFAULT_ADDRESS = "0.0.0.0"; + + private static final Log LOG = LogFactory.getLog(RestServer.class); + + private final String address; + private final int port; + private Server server; + + public RestServer(String address, int port) { + this.address = address; + this.port = port; + } + + @Override + protected void startUp() throws Exception { + // setup the jetty config + ServletHolder sh = new ServletHolder(ServletContainer.class); + sh.setInitParameter("com.sun.jersey.config.property.packages", "com.twitter.hraven.rest"); + sh.setInitParameter(JSONConfiguration.FEATURE_POJO_MAPPING, "true"); + + server = new Server(); + + Connector connector = new SelectChannelConnector(); + connector.setPort(this.port); + connector.setHost(address); + + server.addConnector(connector); + + // TODO: in the future we may want to provide settings for the min and max threads + // Jetty sets the default max thread number to 250, if we don't set it. 
+ // + QueuedThreadPool threadPool = new QueuedThreadPool(); + server.setThreadPool(threadPool); + + server.setSendServerVersion(false); + server.setSendDateHeader(false); + server.setStopAtShutdown(true); + // set up context + Context context = new Context(server, "/", Context.SESSIONS); + context.addServlet(sh, "/*"); + + // start server + server.start(); + } + + @Override + protected void shutDown() throws Exception { + server.stop(); + } + + private static void printUsage(Options opts) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("bin/hraven rest start", "", opts, + "To run the REST server, execute bin/hraven rest start|stop [-p ]", true); + } + + public static void main(String[] args) { + // parse commandline options + Options opts = new Options(); + opts.addOption("p", "port", true, "Port for server to bind to (default 8080)"); + opts.addOption("a", "address", true, "IP address for server to bind to (default 0.0.0.0)"); + CommandLine cmd = null; + try { + cmd = new PosixParser().parse(opts, args); + } catch (ParseException pe) { + LOG.fatal("Failed to parse arguments", pe); + printUsage(opts); + System.exit(1); + } + + String address = DEFAULT_ADDRESS; + int port = DEFAULT_PORT; + if (cmd.hasOption("p")) { + try { + port = Integer.parseInt(cmd.getOptionValue("p")); + } catch (NumberFormatException nfe) { + LOG.fatal("Invalid integer '"+cmd.getOptionValue("p")+"'", nfe); + printUsage(opts); + System.exit(2); + } + } + if (cmd.hasOption("a")) { + address = cmd.getOptionValue("a"); + } + RestServer server = new RestServer(address, port); + server.startAndWait(); + // run until we're done + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/rest/SerializationContext.java b/hraven-core/src/main/java/com/twitter/hraven/rest/SerializationContext.java new file mode 100644 index 0000000..ba15bd8 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/rest/SerializationContext.java @@ -0,0 +1,124 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.rest; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; + +import com.google.common.base.Predicate; +import com.google.common.collect.Lists; + +/** +*/ +public class SerializationContext { + public enum DetailLevel { + + /** + * Indicating that everything in the object should be returned + */ + EVERYTHING, + + /** + * Indicating that only summary stats are to be returned + */ + FLOW_SUMMARY_STATS_ONLY, + + /** + * Indicating that job details along with summary stats are to be returned + */ + FLOW_SUMMARY_STATS_WITH_JOB_STATS; + } + + /** + * Restricts returned job configuration data to specific configuration + * properties. 
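Hedged usage sketch, not part of this patch: starting the REST server embedded in another JVM instead of via "bin/hraven rest start". The address and port are examples; startAndWait() comes from Guava's AbstractIdleService, which RestServer extends.

import com.twitter.hraven.rest.RestServer;

public class EmbeddedRestServerSketch {
  public static void main(String[] args) throws Exception {
    RestServer server = new RestServer("127.0.0.1", 9090);   // address and port are examples
    server.startAndWait();   // AbstractIdleService: returns once Jetty is listening
    // ... serve requests; shut down later with server.stopAndWait()
  }
}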
+ */ + public static class ConfigurationFilter implements Predicate { + private final Set allowedKeys; + + public ConfigurationFilter(List keys) { + if (keys != null) { + this.allowedKeys = new HashSet(keys); + } else { + this.allowedKeys = null; + } + } + + /** + * Returns true if the given configuration property + * is contained in the set of allowed configuration keys. + */ + @Override + public boolean apply(String potentialKey) { + return allowedKeys != null && allowedKeys.contains(potentialKey); + } + } + + /** + * Restricts returned job configuration data to configuration properties matching a set + * of regular expressions. + */ + public static class RegexConfigurationFilter implements Predicate { + private final List allowedPatterns; + + public RegexConfigurationFilter(List patterns) { + if (patterns != null) { + allowedPatterns = Lists.newArrayListWithCapacity(patterns.size()); + for (String p : patterns) { + allowedPatterns.add(Pattern.compile(p)); + } + } else { + allowedPatterns = null; + } + } + + @Override + public boolean apply(String potentialKey) { + if (allowedPatterns != null) { + for (Pattern p : allowedPatterns) { + if (p.matcher(potentialKey).matches()) { + return true; + } + } + } + return false; + } + } + + private final DetailLevel level; + private final Predicate filter; + + public SerializationContext(DetailLevel serializationLevel) { + this.level = serializationLevel; + this.filter = null; + } + + public SerializationContext(DetailLevel serializationLevel, + Predicate filter) { + this.level = serializationLevel; + this.filter = filter; + } + + public DetailLevel getLevel() { + return this.level; + } + + public Predicate getConfigurationFilter() { + return this.filter; + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/rest/client/HRavenRestClient.java b/hraven-core/src/main/java/com/twitter/hraven/rest/client/HRavenRestClient.java new file mode 100644 index 0000000..af53f6a --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/rest/client/HRavenRestClient.java @@ -0,0 +1,299 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
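Illustrative sketch, not part of this patch: how the two configuration filters above decide which job configuration keys survive serialization. The key names and pattern are arbitrary examples.

import java.util.Arrays;

import com.twitter.hraven.rest.SerializationContext;

public class ConfigFilterSketch {
  public static void main(String[] args) {
    SerializationContext.ConfigurationFilter exact =
        new SerializationContext.ConfigurationFilter(Arrays.asList("pig.alias", "mapred.job.name"));
    System.out.println(exact.apply("pig.alias"));           // true: key is in the allowed set
    System.out.println(exact.apply("pig.alias.location"));  // false: only exact matches pass

    SerializationContext.RegexConfigurationFilter regex =
        new SerializationContext.RegexConfigurationFilter(Arrays.asList("pig\\..*"));
    System.out.println(regex.apply("pig.alias"));           // true: matches pig\..*
    System.out.println(regex.apply("mapred.job.name"));     // false: no pattern matches
  }
}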
+*/ +package com.twitter.hraven.rest.client; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.net.URLConnection; +import java.net.URLEncoder; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.codehaus.jackson.map.ObjectMapper; +import org.codehaus.jackson.type.TypeReference; + +import com.twitter.hraven.Flow; +import com.twitter.hraven.JobDetails; +import com.twitter.hraven.datasource.JobHistoryService; +import com.twitter.hraven.rest.ObjectMapperProvider; +import com.twitter.hraven.util.JSONUtil; +import com.twitter.hraven.util.StringUtil; + +/** + * + */ +public class HRavenRestClient { + private static final Log LOG = LogFactory.getLog(HRavenRestClient.class); + + private String apiHostname; + private int connectTimeout; + private int readTimeout; + + /** + * Initializes with the given hostname and a default connect and read timeout of 5 seconds. + * @param apiHostname the hostname to connect to + */ + public HRavenRestClient(String apiHostname) { + this(apiHostname, 5000, 5000); + } + + public HRavenRestClient(String apiHostname, int connectTimeout, int readTimeout) { + this.apiHostname = apiHostname; + this.connectTimeout = connectTimeout; + this.readTimeout = readTimeout; + LOG.info(String.format( + "Initializing HRavenRestClient with apiHostname=%s, connectTimeout=%d ms, readTimeout=%d ms", + apiHostname, connectTimeout, readTimeout)); + } + + public List fetchFlows(String cluster, + String username, + String batchDesc, + String signature, + int limit) throws IOException { + LOG.info(String.format("Fetching last %d matching jobs for cluster=%s, user.name=%s, " + + "batch.desc=%s, pig.logical.plan.signature=%s", limit, cluster, username, batchDesc, signature)); + + String urlString = signature == null ? + String.format("http://%s/api/v1/flow/%s/%s/%s?limit=%d", + apiHostname, cluster, username, StringUtil.cleanseToken(batchDesc), limit) : + String.format("http://%s/api/v1/flow/%s/%s/%s/%s?limit=%d", + apiHostname, cluster, username, StringUtil.cleanseToken(batchDesc), signature, limit); + + if (LOG.isInfoEnabled()) { + LOG.info("Requesting job history from " + urlString); + } + + return retrieveFlowsFromURL(urlString); + } + + public List fetchFlowsWithConfig(String cluster, + String username, + String batchDesc, + String signature, + int limit, + String... configProps) throws IOException { + LOG.info(String.format("Fetching last %d matching jobs for cluster=%s, user.name=%s, " + + "batch.desc=%s, pig.logical.plan.signature=%s", limit, cluster, username, batchDesc, signature)); + + String configParam = ""; + if (configProps != null && configProps.length > 0) { + configParam = buildConfigParam("includeConf", configProps); + } + String urlString = signature == null ? 
+ String.format("http://%s/api/v1/flow/%s/%s/%s?limit=%d&%s", + apiHostname, cluster, username, StringUtil.cleanseToken(batchDesc), limit, configParam) : + String.format("http://%s/api/v1/flow/%s/%s/%s/%s?limit=%d&%s", + apiHostname, cluster, username, StringUtil.cleanseToken(batchDesc), signature, limit, + configParam); + + if (LOG.isInfoEnabled()) { + LOG.info("Requesting job history from " + urlString); + } + + return retrieveFlowsFromURL(urlString); + } + + public List fetchFlowsWithConfigPatterns(String cluster, + String username, + String batchDesc, + String signature, + int limit, + String... configPatterns) throws IOException { + LOG.info(String.format("Fetching last %d matching jobs for cluster=%s, user.name=%s, " + + "batch.desc=%s, pig.logical.plan.signature=%s", limit, cluster, username, batchDesc, signature)); + + String configParam = ""; + if (configPatterns != null && configPatterns.length > 0) { + configParam = buildConfigParam("includeConfRegex", configPatterns); + } + String urlString = signature == null ? + String.format("http://%s/api/v1/flow/%s/%s/%s?limit=%d&%s", + apiHostname, cluster, username, StringUtil.cleanseToken(batchDesc), limit, configParam) : + String.format("http://%s/api/v1/flow/%s/%s/%s/%s?limit=%d&%s", + apiHostname, cluster, username, StringUtil.cleanseToken(batchDesc), signature, limit, + configParam); + + if (LOG.isInfoEnabled()) { + LOG.info("Requesting job history from " + urlString); + } + + return retrieveFlowsFromURL(urlString); + } + + private String buildConfigParam(String paramName, String[] paramArgs) throws IOException { + StringBuilder sb = new StringBuilder(); + for (String arg : paramArgs) { + if (sb.length() > 0) { + sb.append("&"); + } + sb.append(paramName).append("=").append(URLEncoder.encode(arg, "UTF-8")); + } + return sb.toString(); + } + + @SuppressWarnings("unchecked") + private List retrieveFlowsFromURL(String endpointURL) throws IOException { + InputStream input = null; + try { + URL url = new URL(endpointURL); + URLConnection connection = url.openConnection(); + connection.setConnectTimeout(this.connectTimeout); + connection.setReadTimeout(this.readTimeout); + input = connection.getInputStream(); + return (List) JSONUtil.readJson(input, new TypeReference>() {}); + } finally { + if (input != null) { + try { + input.close(); + } catch (IOException e) { + LOG.warn(e); + } + } + } + } + + private static DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + + public static void main(String[] args) throws IOException { + String apiHostname = null; + String cluster = null; + String username = null; + String batchDesc = null; + String signature = null; + int limit = 2; + boolean useHBaseAPI = false; + boolean dumpJson = false; + + StringBuffer usage = new StringBuffer("Usage: java "); + usage.append(HRavenRestClient.class.getName()).append(" [-options]\n"); + usage.append("Returns data from recent flows and their associated jobs\n"); + usage.append("where options include: \n"); + usage.append(" -a [required]\n"); + usage.append(" -c [required]\n"); + usage.append(" -u [required]\n"); + usage.append(" -f [required]\n"); + usage.append(" -s \n"); + usage.append(" -l \n"); + usage.append(" -h - print this message and return\n"); + usage.append(" -H - use HBase API, not the REST API\n"); + usage.append(" -j - output json"); + + for (int i = 0; i < args.length; i++) { + if("-a".equals(args[i])) { + apiHostname = args[++i]; + continue; + } else if("-c".equals(args[i])) { + cluster = args[++i]; + continue; + } else 
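Hedged usage sketch, not part of this patch: fetching recent flows through the REST client defined above, pulling only two configuration properties per job. The hostname, cluster, user and batch description are placeholders; the returned list is typed List<Flow> in the original source.

import java.util.List;

import com.twitter.hraven.Flow;
import com.twitter.hraven.rest.client.HRavenRestClient;

public class RestClientSketch {
  public static void main(String[] args) throws Exception {
    // 10 second connect and read timeouts; host and parameters are placeholders
    HRavenRestClient client = new HRavenRestClient("hraven.example.com:8080", 10000, 10000);
    List<Flow> flows = client.fetchFlowsWithConfig(
        "cluster1@identifier1", "edgar", "wordcount", null /* no signature */, 3,
        "pig.alias", "pig.job.feature");
    System.out.println("Fetched " + flows.size() + " flows");
  }
}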
if("-u".equals(args[i])) { + username = args[++i]; + continue; + } else if("-f".equals(args[i])) { + batchDesc = args[++i]; + continue; + } else if("-s".equals(args[i])) { + signature = args[++i]; + continue; + } else if("-l".equals(args[i])) { + limit = Integer.parseInt(args[++i]); + continue; + } else if("-H".equals(args[i])) { + useHBaseAPI = true; + continue; + } else if("-j".equals(args[i])) { + dumpJson = true; + continue; + } else if ("-h".equals(args[i])) { + System.err.println(usage.toString()); + System.exit(1); + } else { + + } + } + + if (apiHostname == null || cluster == null || username == null || batchDesc == null) { + System.err.println(usage.toString()); + System.exit(1); + } + + List flows; + if (useHBaseAPI) { + JobHistoryService jobHistoryService = new JobHistoryService(HBaseConfiguration.create()); + flows = jobHistoryService.getFlowSeries(cluster, username, batchDesc, signature, false, limit); + } else { + HRavenRestClient client = new HRavenRestClient(apiHostname); + flows = client.fetchFlows(cluster, username, batchDesc, signature, limit); + } + + if(dumpJson) { + ObjectMapper om = ObjectMapperProvider.createCustomMapper(); + System.out.println(om.writeValueAsString(flows)); + return; + } + + System.out.println("Found " + flows.size() + " flows"); + StringBuilder sb = new StringBuilder(); + sb.append("\t\t").append("jobId"); + sb.append("\t\t").append("version"); + sb.append("\t\t").append("status"); + sb.append("\t").append("maps"); + sb.append("\t").append("reduces"); + sb.append("\t").append("rBytesRead"); + sb.append("\t").append("feature"); + sb.append("\t\t\t").append("alias"); + System.out.println(sb.toString()); + + int i = 0; + for (Flow flow : flows) { + long minSubmitTime = -1, maxFinishTime = -1; + for (JobDetails job : flow.getJobs()) { + if (minSubmitTime == -1 && job.getSubmitTime() > 0) { + minSubmitTime = job.getSubmitTime(); + } + minSubmitTime = Math.min(minSubmitTime, job.getSubmitTime()); + maxFinishTime = Math.max(maxFinishTime, job.getFinishTime()); + } + + if (minSubmitTime > 0 && maxFinishTime > 0) { + System.out.println(String.format("Flow #%d: %s - %s", i++, + DATE_FORMAT.format(minSubmitTime), DATE_FORMAT.format(maxFinishTime))); + } else { + System.out.println(String.format("Flow #%d:", i++)); + } + + for (JobDetails job : flow.getJobs()) { + sb = new StringBuilder(); + sb.append(" - ").append(job.getJobId()); + sb.append("\t").append(job.getVersion()); + sb.append("\t").append(job.getStatus()); + sb.append("\t").append(job.getTotalMaps()); + sb.append("\t").append(job.getTotalReduces()); + long reduceBytesRead = job.getReduceCounters().getCounter("FileSystemCounters", "FILE_BYTES_READ") != null ? + job.getReduceCounters().getCounter("FileSystemCounters", "FILE_BYTES_READ").getValue() : -1; + sb.append("\t").append(reduceBytesRead); + sb.append("\t").append(job.getConfiguration().get("pig.job.feature")); + sb.append("\t").append(job.getConfiguration().get("pig.alias")); + System.out.println(sb.toString()); + } + } + } +} \ No newline at end of file diff --git a/hraven-core/src/main/java/com/twitter/hraven/util/BatchUtil.java b/hraven-core/src/main/java/com/twitter/hraven/util/BatchUtil.java new file mode 100644 index 0000000..50037ab --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/util/BatchUtil.java @@ -0,0 +1,121 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.util; + +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; + +import com.twitter.hraven.Range; + +/** + * Utility class that helps process items in batches. + */ +public class BatchUtil { + + /** + * Method that can be used when iterating over an array and you want to retain + * only maxRetention items. + * + * @param i + * index of element in ordered array of length + * @param maxRetention + * total number of elements to retain. + * @param length + * of the ordered array + * @return whether this element should be retained or not. + */ + public static boolean shouldRetain(int i, int maxRetention, int length) { + // Files with a zero-based index greater or equal than the retentionCutoff + // should be retained. + int retentionCutoff = length - maxRetention; + boolean retain = (i >= retentionCutoff) ? true : false; + return retain; + } + + /** + * @param length + * of the items to process + * @param batchSize + * size of the batch. + * @return return the number of batches it takes to process length items in + * batchSize batches. + */ + public static int getBatchCount(int length, int batchSize) { + // Avoid negative numbers. + if ((batchSize < 1) || (length < 1)) { + return 0; + } + int remainder = length % batchSize; + return (remainder > 0) ? (length / batchSize) + 1 : (length / batchSize); + } + + /** + * Given an (ordered) {@link Collection} of non-null, iterate + * over the collection and return a list of ranges of the given batchSize. The + * last range may be smaller and contains the remainder if the collection is + * not equally divisible in batchSize ranges. + * + * @param + * The class of Elements in the collection out of which to create + * ranges. + * + * @param collection + * of non-null elements to chop up in ranges. + * @param batchSize + * the size to chop the collection into. Must be larger than + * 1 and can be larger than the collection. + * @return a non-null list of ranges. + *

+ * For example, getRanges([1,2,3], 1) is + * [1-1,2-2,3-3] and + * getRanges([1,2,3,4,5,6,7,8,9,10], 3) is + * [1-3,4-6,7-9,10-10] and + * getRanges([1,2,3,4,5,6,7,8,9,10], 17) is + * [1-10] + */ + public static > List> getRanges(Collection collection, + int batchSize) { + + List> rangeList = new LinkedList>(); + + E currentMin = null; + + // Check for edge cases + if ((collection != null) && (collection.size() > 0) && (batchSize > 0)) { + + int index = 1; + for (E element : collection) { + // Bootstrap first element in the next range + if (currentMin == null) { + currentMin = element; + } + + int mod = index % batchSize; + // On each batchSize items (and the last one) create a range + if ((mod == 0) || (index == collection.size())) { + Range range = new Range(currentMin, element); + rangeList.add(range); + currentMin = null; + } + + index++; + } + } + + return rangeList; + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/util/ByteUtil.java b/hraven-core/src/main/java/com/twitter/hraven/util/ByteUtil.java new file mode 100644 index 0000000..6aa4a4f --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/util/ByteUtil.java @@ -0,0 +1,241 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.util; + +import java.util.ArrayList; +import java.util.List; + +import com.twitter.hraven.Constants; + +/** + */ +public class ByteUtil { + public static class Range { + private int startIdx; + private int endIdx; + + /** + * Defines a range from start index (inclusive) to end index (exclusive). + * + * @param start + * Starting index position + * @param end + * Ending index position (exclusive) + */ + public Range(int start, int end) { + if (start < 0 || end < start) { + throw new IllegalArgumentException( + "Invalid range, required that: 0 <= start <= end; start=" + start + + ", end=" + end); + } + + this.startIdx = start; + this.endIdx = end; + } + + public int start() { + return startIdx; + } + + public int end() { + return endIdx; + } + + public int length() { + return endIdx - startIdx; + } + } + + /** + * Splits the source array into multiple array segments using the given + * separator, up to a maximum of count items. This will naturally produce + * copied byte arrays for each of the split segments. To identify the split + * ranges without the array copies, see + * {@link ByteUtil#splitRanges(byte[], byte[])}. + * + * @param source + * @param separator + * @return + */ + public static byte[][] split(byte[] source, byte[] separator) { + return split(source, separator, -1); + } + + /** + * Splits the source array into multiple array segments using the given + * separator, up to a maximum of count items. This will naturally produce + * copied byte arrays for each of the split segments. To identify the split + * ranges without the array copies, see + * {@link ByteUtil#splitRanges(byte[], byte[])}. 
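Illustrative sketch, not part of this patch: the batching helpers above on a small input. Range and getRanges are generic in the original source (the type parameters are stripped in this rendering of the patch), so the types below are inferred from the constructor usage in getRanges.

import java.util.Arrays;
import java.util.List;

import com.twitter.hraven.Range;
import com.twitter.hraven.util.BatchUtil;

public class BatchUtilSketch {
  public static void main(String[] args) {
    // 10 items in batches of 3 need 4 batches (3 + 3 + 3 + 1)
    System.out.println(BatchUtil.getBatchCount(10, 3));   // prints 4

    List<Integer> items = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    List<Range<Integer>> ranges = BatchUtil.getRanges(items, 3);
    // produces the four ranges [1-3], [4-6], [7-9], [10-10] described in the javadoc above
    System.out.println(ranges.size());                    // prints 4
  }
}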
+ * + * @param source + * @param separator + * @return + */ + public static byte[][] split(byte[] source, byte[] separator, int limit) { + List segments = splitRanges(source, separator, limit); + + byte[][] splits = new byte[segments.size()][]; + for (int i = 0; i < segments.size(); i++) { + Range r = segments.get(i); + byte[] tmp = new byte[r.length()]; + if (tmp.length > 0) { + System.arraycopy(source, r.start(), tmp, 0, r.length()); + } + splits[i] = tmp; + } + return splits; + } + + /** + * Returns a list of ranges identifying [start, end) -- closed, open -- + * positions within the source byte array that would be split using the + * separator byte array. + */ + public static List splitRanges(byte[] source, byte[] separator) { + return splitRanges(source, separator, -1); + } + + /** + * Returns a list of ranges identifying [start, end) -- closed, open -- + * positions within the source byte array that would be split using the + * separator byte array. + * @param source the source data + * @param separator the separator pattern to look for + * @param limit the maximum number of splits to identify in the source + */ + public static List splitRanges(byte[] source, byte[] separator, int limit) { + List segments = new ArrayList(); + int start = 0; + itersource: for (int i = 0; i < source.length; i++) { + for (int j = 0; j < separator.length; j++) { + if (source[i + j] != separator[j]) { + continue itersource; + } + } + // all separator elements matched + if (limit > 0 && segments.size() >= (limit-1)) { + // everything else goes in one final segment + break; + } + + segments.add(new Range(start, i)); + start = i + separator.length; + // i will be incremented again in outer for loop + i += separator.length-1; + } + // add in remaining to a final range + if (start <= source.length) { + segments.add(new Range(start, source.length)); + } + return segments; + } + + /** + * Returns a single byte array containing all of the individual component + * arrays separated by the separator array. + * @param separator + * @param components + * @return + */ + public static byte[] join(byte[] separator, byte[]... components) { + if (components == null || components.length == 0) { + return Constants.EMPTY_BYTES; + } + + int finalSize = 0; + if (separator != null) { + finalSize = separator.length * (components.length - 1); + } + for (byte[] comp : components) { + finalSize += comp.length; + } + + byte[] buf = new byte[finalSize]; + int offset = 0; + for (int i=0; i < components.length; i++) { + System.arraycopy(components[i], 0, buf, offset, components[i].length); + offset += components[i].length; + if (i < (components.length-1) && separator != null && separator.length > 0) { + System.arraycopy(separator, 0, buf, offset, separator.length); + offset += separator.length; + } + } + return buf; + } + + /** + * Returns the index (start position) of the first occurrence of the specified + * {@code target} within {@code array} starting at {@code fromIndex} , or + * {@code -1} if there is no such occurrence. + * + *
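Illustrative sketch, not part of this patch: joining key components with a separator and splitting them back apart using the helpers above. The single-byte "!" separator mirrors the separator seen in the counter column names elsewhere in this commit and is only an example here.

import org.apache.hadoop.hbase.util.Bytes;

import com.twitter.hraven.util.ByteUtil;

public class ByteUtilSketch {
  public static void main(String[] args) {
    byte[] sep = Bytes.toBytes("!");
    byte[] key = ByteUtil.join(sep,
        Bytes.toBytes("cluster1"), Bytes.toBytes("user"), Bytes.toBytes("app"));
    byte[][] parts = ByteUtil.split(key, sep);
    System.out.println(parts.length);              // prints 3
    System.out.println(Bytes.toString(parts[0]));  // prints cluster1
  }
}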

+ * Returns the lowest index {@code k} such that {@code k >= fromIndex} and + * {@code java.util.Arrays.copyOfRange(array, k, k + target.length)} contains + * exactly the same elements as {@code target}. + * + * @param array + * the array to search for the sequence {@code target} + * @param target + * the array to search for as a sub-sequence of {@code array} + * @param fromIndex + * the index to start the search from in {@code array} + */ + public static int indexOf(byte[] array, byte[] target, int fromIndex) { + + if (array == null || target == null) { + return -1; + } + + // Target cannot be beyond array boundaries + if (fromIndex < 0 || (fromIndex > (array.length - target.length))) { + return -1; + } + + // Empty is assumed to be at the fromIndex of any non-null array (after + // boundary check) + if (target.length == 0) { + return fromIndex; + } + + firstbyte: for (int i = fromIndex; i < array.length - target.length + 1; i++) { + for (int j = 0; j < target.length; j++) { + if (array[i + j] != target[j]) { + continue firstbyte; + } + } + return i; + } + return -1; + } + + /** + * Returns a copy of the source byte array, starting at offset for the given + * length. If the offset + length is out of bounds for the array, returns null. + * @param source + * @param offset + * @param length + * @return + */ + public static byte[] safeCopy(byte[] source, int offset, int length) { + if (length < 0 || source.length < offset+length) { + return null; + } + byte[] copy = new byte[length]; + System.arraycopy(source, offset, copy, 0, length); + return copy; + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/util/DateUtil.java b/hraven-core/src/main/java/com/twitter/hraven/util/DateUtil.java new file mode 100644 index 0000000..cf6560e --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/util/DateUtil.java @@ -0,0 +1,40 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.util; + +import java.util.Calendar; +import java.util.GregorianCalendar; + +/** + */ +public class DateUtil { + public static final long MONTH_IN_MILLIS = 30L*24*60*60*1000; + + /** + * @return the timestamp (in milliseconds) of baseTimestamp truncate to month start + */ + public static long getMonthStart(long baseTimestamp) { + Calendar cal = new GregorianCalendar(); + cal.setTimeInMillis(baseTimestamp); + // truncate to start of month + cal.set(Calendar.DAY_OF_MONTH, 1); + cal.set(Calendar.HOUR_OF_DAY, 0); + cal.set(Calendar.MINUTE, 0); + cal.set(Calendar.SECOND, 0); + cal.set(Calendar.MILLISECOND, 0); + return cal.getTimeInMillis(); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/util/JSONUtil.java b/hraven-core/src/main/java/com/twitter/hraven/util/JSONUtil.java new file mode 100644 index 0000000..364600f --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/util/JSONUtil.java @@ -0,0 +1,65 @@ +/* +Copyright 2013 Twitter, Inc. 
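Small usage sketch, not part of this patch, for DateUtil.getMonthStart above; note that the truncation uses a GregorianCalendar and therefore the JVM's default time zone.

import java.util.Date;

import com.twitter.hraven.util.DateUtil;

public class MonthStartSketch {
  public static void main(String[] args) {
    long monthStart = DateUtil.getMonthStart(System.currentTimeMillis());
    // midnight on the first day of the current month, in the JVM's default time zone
    System.out.println(new Date(monthStart));
  }
}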
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.util; + +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintWriter; +import java.io.Writer; + +import org.codehaus.jackson.map.DeserializationConfig; +import org.codehaus.jackson.map.ObjectMapper; +import org.codehaus.jackson.map.SerializationConfig; +import org.codehaus.jackson.type.TypeReference; + +import com.twitter.hraven.ClientObjectMapper; +import com.twitter.hraven.rest.ObjectMapperProvider; + +/** + * Helper class used in the rest client. + */ +// TODO: Remove this class. +@Deprecated +public class JSONUtil { + + /** + * Writes object to the writer as JSON using Jackson and adds a new-line before flushing. + * @param writer the writer to write the JSON to + * @param object the object to write as JSON + * @throws IOException if the object can't be serialized as JSON or written to the writer + */ + public static void writeJson(Writer writer, Object object) throws IOException { + ObjectMapper om = ObjectMapperProvider.createCustomMapper(); + + om.configure(SerializationConfig.Feature.INDENT_OUTPUT, true); + om.configure(SerializationConfig.Feature.FAIL_ON_EMPTY_BEANS, false); + + writer.write(om.writeValueAsString(object)); + writer.write("\n"); + writer.flush(); + } + + public static void writeJson(String fileName, Object object) throws IOException { + JSONUtil.writeJson(new PrintWriter(fileName), object); + } + + public static Object readJson(InputStream inputStream, TypeReference type) throws IOException { + ObjectMapper om = ClientObjectMapper.createCustomMapper(); + om.getDeserializationConfig().set(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, + false); + return om.readValue(inputStream, type); + } +} diff --git a/hraven-core/src/main/java/com/twitter/hraven/util/StringUtil.java b/hraven-core/src/main/java/com/twitter/hraven/util/StringUtil.java new file mode 100644 index 0000000..aaf1bd1 --- /dev/null +++ b/hraven-core/src/main/java/com/twitter/hraven/util/StringUtil.java @@ -0,0 +1,44 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.util; + +import com.twitter.hraven.Constants; + +/** + * Utility class for string manipulation. + */ +public class StringUtil { + + private static final String SPACE = " "; + private static final String UNDERSCORE = "_"; + + /** + * Takes a string token to be used as a key or qualifier and cleanses out reserved tokens. This + * operation is not symetrical. 
Logic is to replace all spaces and exclamation points with + * underscores. + * + * @param token token to cleanse. + * @return + */ + public static String cleanseToken(String token) { + if (token == null || token.length() == 0) { return token; }; + + String cleansed = token.replaceAll(SPACE, UNDERSCORE); + cleansed = cleansed.replaceAll(Constants.SEP, UNDERSCORE); + + return cleansed; + } +} diff --git a/hraven-core/src/main/resources/hadoopclusters.properties b/hraven-core/src/main/resources/hadoopclusters.properties new file mode 100644 index 0000000..4765c45 --- /dev/null +++ b/hraven-core/src/main/resources/hadoopclusters.properties @@ -0,0 +1,4 @@ +#This property file is used to map the jobtracker address from a configuration file to a cluster identifier. +cluster1.identifier1.example.com=cluster1@identifier1 +cluster1.identifier2.example.com=cluster2@identifier2 +hbase-cluster2.identifier2.example.com=hbase-cluster2@identifier2 diff --git a/hraven-core/src/test/java/com/twitter/hraven/AllTests.java b/hraven-core/src/test/java/com/twitter/hraven/AllTests.java new file mode 100644 index 0000000..7f9c047 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/AllTests.java @@ -0,0 +1,40 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import org.junit.runner.RunWith; +import org.junit.runners.Suite; +import org.junit.runners.Suite.SuiteClasses; + +import com.twitter.hraven.datasource.TestAppVersionService; +import com.twitter.hraven.datasource.TestFlowEventService; +import com.twitter.hraven.datasource.TestFlowQueueKeyConverter; +import com.twitter.hraven.datasource.TestJobHistoryRawService; +import com.twitter.hraven.datasource.TestJobHistoryService; +import com.twitter.hraven.util.TestBatchUtil; +import com.twitter.hraven.util.TestByteUtil; + +@RunWith(Suite.class) +@SuiteClasses({ TestFramework.class, TestJobDescFactoryBase.class, + TestJobId.class, TestJobKey.class, TestJsonSerde.class, + TestPigJobDescFactory.class, TestScaldingJobDescFactory.class, + TestTaskKey.class, TestAppVersionService.class, + TestFlowEventService.class, TestFlowQueueKeyConverter.class, + TestJobHistoryRawService.class, TestJobHistoryService.class, + TestBatchUtil.class, TestByteUtil.class }) +public class AllTests { + +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/GenerateFlowTestData.java b/hraven-core/src/test/java/com/twitter/hraven/GenerateFlowTestData.java new file mode 100644 index 0000000..a41a352 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/GenerateFlowTestData.java @@ -0,0 +1,119 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
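Illustrative sketch, not part of this patch: cleanseToken above makes a token safe to embed in row keys and qualifiers by replacing spaces and the reserved separator with underscores. Constants.SEP is assumed here to be "!", consistent with the counter column names used elsewhere in this commit; the input string is an arbitrary example.

import com.twitter.hraven.util.StringUtil;

public class CleanseTokenSketch {
  public static void main(String[] args) {
    // spaces and the reserved separator are both replaced with underscores
    System.out.println(StringUtil.cleanseToken("my app!daily run"));  // prints my_app_daily_run
  }
}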
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.util.Bytes; +import com.twitter.hraven.JobHistoryKeys; +import com.twitter.hraven.datasource.JobHistoryByIdService; +import com.twitter.hraven.datasource.JobKeyConverter; + + +/** + * Stores data in job_history table + * also retrieves flow stats + */ +public class GenerateFlowTestData { + @SuppressWarnings("unused") + private static Log LOG = LogFactory.getLog(GenerateFlowTestData.class); + /** Default dummy configuration properties */ + private static Map DEFAULT_CONFIG = new HashMap(); + static { + DEFAULT_CONFIG.put("testproperty1", "value1"); + DEFAULT_CONFIG.put("testproperty2", "value2"); + } + + private int jobIdCnt; + + public void loadFlow(String cluster, String user, String app, long runId, + String version, int jobCount, long baseStats, + JobHistoryByIdService idService, HTable historyTable) + throws Exception { + loadFlow(cluster, user, app, runId, version, jobCount, baseStats, idService, historyTable, + DEFAULT_CONFIG); + } + + public void loadFlow(String cluster, String user, String app, long runId, + String version, int jobCount, long baseStats, + JobHistoryByIdService idService, HTable historyTable, Map config) + throws Exception { + List puts = new ArrayList(jobCount); + JobKeyConverter keyConv = new JobKeyConverter(); + long curTime = 1355614887; + for (int i = 0; i < jobCount; i++) { + String jobId = String.format("job_20120101000000_%04d", jobIdCnt++); + JobKey key = new JobKey(cluster, user, app, runId, jobId); + Put p = new Put(keyConv.toBytes(key)); + p.add(Constants.INFO_FAM_BYTES, JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.JOBID), + Bytes.toBytes(jobId)); + p.add(Constants.INFO_FAM_BYTES, JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.JOB_STATUS), + Bytes.toBytes("SUCCESS")); + p.add(Constants.INFO_FAM_BYTES, Constants.VERSION_COLUMN_BYTES, Bytes.toBytes(version)); + p.add(Constants.INFO_FAM_BYTES, JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TOTAL_MAPS), + Bytes.toBytes(baseStats)); + p.add(Constants.INFO_FAM_BYTES, JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.TOTAL_REDUCES), + Bytes.toBytes(baseStats)); + p.add(Constants.INFO_FAM_BYTES, JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.LAUNCH_TIME), + Bytes.toBytes(curTime)); + p.add(Constants.INFO_FAM_BYTES, JobHistoryKeys.KEYS_TO_BYTES.get(JobHistoryKeys.FINISH_TIME), + Bytes.toBytes( 1000 + curTime)); + p.add(Constants.INFO_FAM_BYTES, Bytes.toBytes("g!FileSystemCounters!HDFS_BYTES_WRITTEN"), + Bytes.toBytes(baseStats)); + p.add(Constants.INFO_FAM_BYTES, Bytes.toBytes("g!FileSystemCounters!HDFS_BYTES_READ"), + Bytes.toBytes(baseStats) ); + p.add(Constants.INFO_FAM_BYTES, Bytes.toBytes("gr!FileSystemCounters!FILE_BYTES_READ"), + Bytes.toBytes(baseStats)); + p.add(Constants.INFO_FAM_BYTES, + 
Bytes.toBytes("gr!org.apache.hadoop.mapred.Task$Counter!REDUCE_SHUFFLE_BYTES"), + Bytes.toBytes(baseStats)); + p.add(Constants.INFO_FAM_BYTES, Bytes.toBytes("gm!FileSystemCounters!FILE_BYTES_READ"), + Bytes.toBytes(baseStats)); + p.add(Constants.INFO_FAM_BYTES, Bytes.toBytes("gm!FileSystemCounters!FILE_BYTES_WRITTEN"), + Bytes.toBytes(baseStats)); + p.add(Constants.INFO_FAM_BYTES, + Bytes.toBytes("g!org.apache.hadoop.mapred.JobInProgress$Counter!SLOTS_MILLIS_MAPS"), + Bytes.toBytes(baseStats)); + p.add(Constants.INFO_FAM_BYTES, + Bytes.toBytes("g!org.apache.hadoop.mapred.JobInProgress$Counter!SLOTS_MILLIS_REDUCES"), + Bytes.toBytes(baseStats)); + + // add some config properties + if (config != null) { + for (Map.Entry entry : config.entrySet()) { + p.add(Constants.INFO_FAM_BYTES, + Bytes.toBytes(Constants.JOB_CONF_COLUMN_PREFIX + Constants.SEP + entry.getKey()), + Bytes.toBytes(entry.getValue())); + } + } + + puts.add(p); + curTime += 1000 ; + + idService.writeIndexes(key); + } + historyTable.put(puts); + } + +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/TestFramework.java b/hraven-core/src/test/java/com/twitter/hraven/TestFramework.java new file mode 100644 index 0000000..503a43d --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/TestFramework.java @@ -0,0 +1,59 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import static com.twitter.hraven.Framework.NONE; +import static com.twitter.hraven.Framework.PIG; +import static com.twitter.hraven.Framework.SCALDING; +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertNotNull; +import static junit.framework.Assert.assertTrue; + +import org.junit.Test; + +import com.twitter.hraven.Framework; + +/** + * Test {@link Framework} + */ +public class TestFramework { + + /** + * Test going back and forth between code and enum + */ + @Test + public void testGetCode() { + assertEquals(PIG, Framework.get(PIG.getCode())); + assertEquals(SCALDING, Framework.get(SCALDING.getCode())); + assertEquals(NONE, Framework.get(NONE.getCode())); + } + + /** + * Confirm descriptions are not null or empty. + */ + @Test + public void getDescription() { + assertNotNull(PIG.getDescription()); + assertNotNull(SCALDING.getDescription()); + assertNotNull(NONE.getDescription()); + + assertTrue("Description is not expected to be empty", PIG.getDescription().length() > 0); + assertTrue("Description is not expected to be empty", SCALDING.getDescription().length() > 0); + assertTrue("Description is not expected to be empty", NONE.getDescription().length() > 0); + } + + +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/TestJobDescFactory.java b/hraven-core/src/test/java/com/twitter/hraven/TestJobDescFactory.java new file mode 100644 index 0000000..60ea56c --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/TestJobDescFactory.java @@ -0,0 +1,42 @@ +/* +Copyright 2012 Twitter, Inc. 
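Hedged sketch, not part of this patch: how a test might seed flow data with GenerateFlowTestData.loadFlow above. The JobHistoryByIdService and HTable handles are assumed to be provided by the test's HBase mini-cluster setup; cluster, user, app and run values are placeholders.

import org.apache.hadoop.hbase.client.HTable;

import com.twitter.hraven.GenerateFlowTestData;
import com.twitter.hraven.datasource.JobHistoryByIdService;

public class SeedFlowsSketch {
  // idService and historyTable are assumed to come from the test's HBase mini-cluster setup
  static void seed(JobHistoryByIdService idService, HTable historyTable) throws Exception {
    GenerateFlowTestData flowDataGen = new GenerateFlowTestData();
    // three successful jobs for one run of "app1", with every counter seeded from baseStats = 10
    flowDataGen.loadFlow("c1@local", "buser", "app1", 1234L, "v1", 3, 10L, idService, historyTable);
  }
}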
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import org.apache.hadoop.conf.Configuration; +import org.junit.Test; + +import com.twitter.hraven.JobDescFactory; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +public class TestJobDescFactory { + @Test + public void testCluster() { + Configuration c = new Configuration(); + c.set(JobDescFactory.JOBTRACKER_KEY, "cluster1.identifier1.example.com:8021"); + String result = JobDescFactory.getCluster(c); + assertNotNull(result); + assertEquals("cluster1@identifier1", result); + + c = new Configuration(); + c.set(JobDescFactory.JOBTRACKER_KEY, "hbase-cluster2.identifier2.example.com:8021"); + result = JobDescFactory.getCluster(c); + assertNotNull(result); + assertEquals("hbase-cluster2@identifier2", result); + + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/TestJobDescFactoryBase.java b/hraven-core/src/test/java/com/twitter/hraven/TestJobDescFactoryBase.java new file mode 100644 index 0000000..321f0dd --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/TestJobDescFactoryBase.java @@ -0,0 +1,45 @@ +package com.twitter.hraven; + +import org.apache.hadoop.conf.Configuration; +import org.junit.Test; +import static junit.framework.Assert.assertEquals; + + +public class TestJobDescFactoryBase extends JobDescFactoryBase { + + public static final String UNSAFE_NAME = "soMe long" + Constants.SEP + "name"; + public static final String SAFE_NAME = "soMe_long_name"; + + /** + * Not interesting for this particular test. + * @param qualifiedJobId + * @param submitTimeMillis + * @param jobConf + * @return + */ + JobDesc create(QualifiedJobId qualifiedJobId, long submitTimeMillis, + Configuration jobConf) { + // Not interesting for this test. + return null; + } + + /** + * @param jobName + * @return + */ + String getAppIdFromJobName(String jobName) { + // Identity transform. + return jobName; + } + + /** + * Test the method to get the app ID from the JobConf. 
+ */ + @Test + public void testgetAppId() { + Configuration conf = new Configuration(); + conf.set(Constants.APP_NAME_CONF_KEY, UNSAFE_NAME); + assertEquals(SAFE_NAME, getAppId(conf)); + } + +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/TestJobHistoryKeys.java b/hraven-core/src/test/java/com/twitter/hraven/TestJobHistoryKeys.java new file mode 100644 index 0000000..51c7328 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/TestJobHistoryKeys.java @@ -0,0 +1,83 @@ +package com.twitter.hraven; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import java.util.HashSet; +import org.junit.Test; + +public class TestJobHistoryKeys { + + private enum test_keys { + JOBTRACKERID, START_TIME, FINISH_TIME, + JOBID, JOBNAME, USER, JOBCONF, SUBMIT_TIME, + LAUNCH_TIME, TOTAL_MAPS, TOTAL_REDUCES, + FAILED_MAPS, FAILED_REDUCES, + FINISHED_MAPS, FINISHED_REDUCES, + JOB_STATUS, TASKID, HOSTNAME, TASK_TYPE, + ERROR, TASK_ATTEMPT_ID, TASK_STATUS, + COPY_PHASE, SORT_PHASE, REDUCE_PHASE, + SHUFFLE_FINISHED, SORT_FINISHED, COUNTERS, + SPLITS, JOB_PRIORITY, HTTP_PORT, + TRACKER_NAME, STATE_STRING, VERSION, + MAP_COUNTERS, REDUCE_COUNTERS, + VIEW_JOB, MODIFY_JOB, JOB_QUEUE; + } + + @Test + public void test_contents() { + + HashSet tk_values = new HashSet(); + for (test_keys tk : test_keys.values()) { + tk_values.add(tk.name()); + } + + for (JobHistoryKeys jhk : JobHistoryKeys.values()) { + assertTrue(tk_values.contains(jhk.name())); + } + } + + @Test + public void test_key_types() { + + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.JOBTRACKERID), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.START_TIME), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.FINISH_TIME), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.JOBID), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.JOBNAME), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.USER), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.JOBCONF), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.SUBMIT_TIME), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.LAUNCH_TIME), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.TOTAL_MAPS), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.TOTAL_REDUCES), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.FAILED_MAPS), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.FAILED_REDUCES), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.FINISHED_MAPS), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.FINISHED_REDUCES), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.JOB_STATUS), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.TASKID), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.HOSTNAME), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.TASK_TYPE), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.ERROR), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.TASK_ATTEMPT_ID), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.TASK_STATUS), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.COPY_PHASE), String.class); + 
assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.SORT_PHASE), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.REDUCE_PHASE), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.SHUFFLE_FINISHED), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.SORT_FINISHED), Long.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.COUNTERS), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.SPLITS), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.JOB_PRIORITY), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.HTTP_PORT), Integer.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.TRACKER_NAME), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.STATE_STRING), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.VERSION), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.MAP_COUNTERS), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.REDUCE_COUNTERS), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.VIEW_JOB), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.MODIFY_JOB), String.class); + assertEquals(JobHistoryKeys.KEY_TYPES.get(JobHistoryKeys.JOB_QUEUE), String.class); + } + +} \ No newline at end of file diff --git a/hraven-core/src/test/java/com/twitter/hraven/TestJobId.java b/hraven-core/src/test/java/com/twitter/hraven/TestJobId.java new file mode 100644 index 0000000..670e838 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/TestJobId.java @@ -0,0 +1,115 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Test; + +import com.twitter.hraven.JobId; +import com.twitter.hraven.datasource.JobIdConverter; + +/** + * Tests for JobId and JobIdConverter + */ +public class TestJobId { + + /** + * Validates job ID parsing and serialization/deserialization to and from bytes. 
+ */ + @Test + public void testSerialization() { + long epoch = 20120101000000L; + String job1 = "job_"+epoch+"_0001"; + String job2 = "job_"+epoch+"_1111"; + String job3 = "job_"+epoch+"_2222"; + String job4 = "job_"+epoch+"_11111"; + + JobId jobId1 = new JobId(job1); + assertEquals(epoch, jobId1.getJobEpoch()); + assertEquals(1L, jobId1.getJobSequence()); + assertEquals(job1, jobId1.getJobIdString()); + + JobId jobId2 = new JobId(job2); + assertEquals(epoch, jobId2.getJobEpoch()); + assertEquals(1111L, jobId2.getJobSequence()); + assertEquals(job2, jobId2.getJobIdString()); + // check Comparable implementation + assertTrue(jobId1.compareTo(jobId2) < 0); + + JobId jobId3 = new JobId(job3); + assertEquals(epoch, jobId3.getJobEpoch()); + assertEquals(2222L, jobId3.getJobSequence()); + assertEquals(job3, jobId3.getJobIdString()); + // check Comparable implementation + assertTrue(jobId2.compareTo(jobId3) < 0); + + JobId jobId4 = new JobId(job4); + assertEquals(epoch, jobId4.getJobEpoch()); + assertEquals(11111L, jobId4.getJobSequence()); + assertEquals(job4, jobId4.getJobIdString()); + // check Comparable implementation + assertTrue(jobId3.compareTo(jobId4) < 0); + + JobIdConverter conv = new JobIdConverter(); + JobId tmp = conv.fromBytes( conv.toBytes(jobId1) ); + assertEquals(jobId1, tmp); + // check hashCode + assertEquals(jobId1.hashCode(), tmp.hashCode()); + tmp = conv.fromBytes( conv.toBytes(jobId2) ); + assertEquals(jobId2, tmp); + assertEquals(jobId2.hashCode(), tmp.hashCode()); + tmp = conv.fromBytes( conv.toBytes(jobId3) ); + assertEquals(jobId3, tmp); + assertEquals(jobId3.hashCode(), tmp.hashCode()); + tmp = conv.fromBytes( conv.toBytes(jobId4) ); + assertEquals(jobId4, tmp); + assertEquals(jobId4.hashCode(), tmp.hashCode()); + } + + /** + * Verifies that JobKey comparisons and byte[] encoded job keys order + * correctly. + */ + @Test + public void testJobIdOrdering() { + String job1 = "job_20120101000000_0001"; + String job2 = "job_20120101000000_1111"; + String job3 = "job_20120101000000_2222"; + String job4 = "job_20120101000000_11111"; + String job5 = "job_20120201000000_0001"; + + JobId jobId1 = new JobId(job1); + JobId jobId2 = new JobId(job2); + JobId jobId3 = new JobId(job3); + JobId jobId4 = new JobId(job4); + JobId jobId5 = new JobId(job5); + + JobIdConverter conv = new JobIdConverter(); + byte[] job1Bytes = conv.toBytes(jobId1); + byte[] job2Bytes = conv.toBytes(jobId2); + byte[] job3Bytes = conv.toBytes(jobId3); + byte[] job4Bytes = conv.toBytes(jobId4); + byte[] job5Bytes = conv.toBytes(jobId5); + + assertTrue(Bytes.compareTo(job1Bytes, job2Bytes) < 0); + assertTrue(Bytes.compareTo(job2Bytes, job3Bytes) < 0); + assertTrue(Bytes.compareTo(job3Bytes, job4Bytes) < 0); + assertTrue(Bytes.compareTo(job4Bytes, job5Bytes) < 0); + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/TestJobKey.java b/hraven-core/src/test/java/com/twitter/hraven/TestJobKey.java new file mode 100644 index 0000000..86b36eb --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/TestJobKey.java @@ -0,0 +1,198 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Test; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.Framework; +import com.twitter.hraven.JobDesc; +import com.twitter.hraven.JobId; +import com.twitter.hraven.JobKey; +import com.twitter.hraven.datasource.JobIdConverter; +import com.twitter.hraven.datasource.JobKeyConverter; +import com.twitter.hraven.util.ByteUtil; + +/** + * Test the JobKey class. + */ +public class TestJobKey { + private static Log LOG = LogFactory.getLog(TestJobKey.class); + + /** + * Confirm that we can properly serialize and deserialize a JobKey. + */ + @Test + public void testKeySerialization() { + JobKeyConverter conv = new JobKeyConverter(); + JobKey key = new JobKey("cluster1@identifier1", "user1", "app1", 13, "job_20120101235959_0001"); + byte[] keyBytes = conv.toBytes(key); + JobKey key2 = conv.fromBytes(keyBytes); + assertEquals(key.getCluster(), key2.getCluster()); + assertEquals(key.getUserName(), key2.getUserName()); + assertEquals(key.getAppId(), key2.getAppId()); + assertEquals(key.getRunId(), key2.getRunId()); + assertEquals(key.getJobId(), key2.getJobId()); + + // also verify that the runId gets inverted in the serialized byte + // representation + byte[][] keyParts = ByteUtil.split(keyBytes, Constants.SEP_BYTES); + assertEquals(5, keyParts.length); + long encodedRunId = Bytes.toLong(keyParts[3]); + assertEquals(key.getRunId(), Long.MAX_VALUE - encodedRunId); + + // test partial keys + key = new JobKey("c1@local", "user1", "app1", 15, (String)null); + keyBytes = conv.toBytes(key); + key2 = conv.fromBytes(keyBytes); + assertEquals(key.getCluster(), key2.getCluster()); + assertEquals(key.getUserName(), key2.getUserName()); + assertEquals(key.getAppId(), key2.getAppId()); + assertEquals(key.getRunId(), key2.getRunId()); + assertEquals(key.getJobId(), key2.getJobId()); + + // key with no trailing job Id + keyBytes = ByteUtil.join(Constants.SEP_BYTES, + Bytes.toBytes("c1@local"), + Bytes.toBytes("user1"), + Bytes.toBytes("app1"), + Bytes.toBytes(Long.MAX_VALUE-15L)); + key2 = conv.fromBytes(keyBytes); + assertEquals("c1@local", key2.getCluster()); + assertEquals("user1", key2.getUserName()); + assertEquals("app1", key2.getAppId()); + assertEquals(15L, key2.getRunId()); + assertEquals(0L, key2.getJobId().getJobEpoch()); + assertEquals(0L, key2.getJobId().getJobSequence()); + + // key with empty appId + key = new JobKey("c1@local", "user1", "", 1234L, "job_201206201718_1941"); + keyBytes = conv.toBytes(key); + key2 = conv.fromBytes(keyBytes); + assertKey(key, key2); + } + + public void assertKey(JobKey expected, JobKey actual) { + assertEquals(expected.getCluster(), actual.getCluster()); + assertEquals(expected.getUserName(), actual.getUserName()); + assertEquals(expected.getAppId(), actual.getAppId()); + assertEquals(expected.getRunId(), actual.getRunId()); + assertEquals(expected.getJobId(), actual.getJobId()); + assertEquals(expected.hashCode(),actual.hashCode()); + } + + /** + * Confirm that leading and trailing spaces get ripped off. 
+ */ + @Test + public void testPlainConstructor() { + JobKeyConverter conv = new JobKeyConverter(); + JobKey key = new JobKey("cluster2@identifier2 ", "user2 ", "appSpace ", 17, + " job_20120101235959_1111 "); + byte[] keyBytes = conv.toBytes(key); + JobKey key2 = conv.fromBytes(keyBytes); + assertEquals(key.getUserName(), key2.getUserName()); + assertEquals(key.getAppId(), key2.getAppId()); + assertEquals(key.getRunId(), key2.getRunId()); + assertEquals(key.getJobId(), key2.getJobId()); + + assertEquals("cluster2@identifier2", key.getCluster()); + assertEquals("user2", key.getUserName()); + assertEquals("appSpace", key.getAppId()); + assertEquals(17, key.getRunId()); + assertEquals("job_20120101235959_1111", key.getJobId().getJobIdString()); + } + + /** + * Confirm that leading and trailing spaces get ripped off. + */ + @Test + public void testJobDescConstructor() { + JobKeyConverter conv = new JobKeyConverter(); + JobDesc jobDesc = new JobDesc("cluster2@identifier3 ", "user3 ", + "appSpace ", "spaceVersion3 ", 19, " job_20120101235959_1111 ", Framework.NONE); + JobKey key = new JobKey(jobDesc); + byte[] keyBytes = conv.toBytes(key); + JobKey key2 = conv.fromBytes(keyBytes); + assertEquals(key.getUserName(), key2.getUserName()); + assertEquals(key.getAppId(), key2.getAppId()); + assertEquals(key.getRunId(), key2.getRunId()); + assertEquals(key.getJobId(), key2.getJobId()); + + assertEquals("user3", key.getUserName()); + assertEquals("cluster2@identifier3", key.getCluster()); + assertEquals("appSpace", key.getAppId()); + assertEquals(19, key.getRunId()); + assertEquals("job_20120101235959_1111", key.getJobId().getJobIdString()); + } + + /** + * Checks for correct parsing of job key when run ID may contain the byte + * representation of the separator character. 
+ */ + @Test + public void testEncodedRunId() { + JobKeyConverter conv = new JobKeyConverter(); + long now = System.currentTimeMillis(); + byte[] encoded = Bytes.toBytes(Long.MAX_VALUE - now); + // replace last byte with separator and reconvert to long + Bytes.putBytes(encoded, encoded.length-Constants.SEP_BYTES.length, + Constants.SEP_BYTES, 0, Constants.SEP_BYTES.length); + long badId = Long.MAX_VALUE - Bytes.toLong(encoded); + LOG.info("Bad run ID is "+badId); + + // assemble a job key with the bad run ID + JobIdConverter idConv = new JobIdConverter(); + byte[] encodedKey = ByteUtil.join(Constants.SEP_BYTES, + Bytes.toBytes("c1@local"), + Bytes.toBytes("user1"), + Bytes.toBytes("app1"), + encoded, + idConv.toBytes(new JobId("job_20120101000000_0001"))); + + JobKey key = conv.fromBytes(encodedKey); + assertEquals("c1@local", key.getCluster()); + assertEquals("user1", key.getUserName()); + assertEquals("app1", key.getAppId()); + assertEquals("job_20120101000000_0001", key.getJobId().getJobIdString()); + } + + @Test + public void testOrdering() { + JobKey key1 = new JobKey("c1@local", "auser", "app", 1234L, "job_20120101000000_0001"); + JobKey key2 = new JobKey("c1@local", "auser", "app", 1234L, "job_20120101000000_2222"); + JobKey key3 = new JobKey("c1@local", "auser", "app", 1234L, "job_20120101000000_11111"); + JobKey key4 = new JobKey("c1@local", "auser", "app", 1345L, "job_20120101000000_0001"); + + JobKeyConverter conv = new JobKeyConverter(); + byte[] key1Bytes = conv.toBytes(key1); + byte[] key2Bytes = conv.toBytes(key2); + byte[] key3Bytes = conv.toBytes(key3); + byte[] key4Bytes = conv.toBytes(key4); + + // highest run ID should sort first + assertTrue(Bytes.compareTo(key4Bytes, key1Bytes) < 0); + // job IDs should sort in numeric order + assertTrue(Bytes.compareTo(key1Bytes, key2Bytes) < 0); + assertTrue(Bytes.compareTo(key2Bytes, key3Bytes) < 0); + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/TestJsonSerde.java b/hraven-core/src/test/java/com/twitter/hraven/TestJsonSerde.java new file mode 100644 index 0000000..ec60015 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/TestJsonSerde.java @@ -0,0 +1,229 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.twitter.hraven.Flow; +import com.twitter.hraven.JobDetails; +import com.twitter.hraven.datasource.HRavenTestUtil; +import com.twitter.hraven.datasource.JobHistoryByIdService; +import com.twitter.hraven.datasource.JobHistoryService; +import com.twitter.hraven.rest.ObjectMapperProvider; +import com.twitter.hraven.rest.RestJSONResource; +import com.twitter.hraven.rest.SerializationContext; +import com.twitter.hraven.util.JSONUtil; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.client.HTable; +import org.codehaus.jackson.map.ObjectMapper; +import org.codehaus.jackson.map.DeserializationConfig; +import org.codehaus.jackson.map.SerializationConfig; +import org.codehaus.jackson.type.TypeReference; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.util.List; +import java.util.Map; + +import static junit.framework.Assert.assertNull; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +/** + * Tests that we can deserialize json and serialize it again and get the same results. Written so + * we can swap in new JSON content from the API and verify that serde still works. + * + */ +@SuppressWarnings("deprecation") +public class TestJsonSerde { + @SuppressWarnings("unused") + private static final Log LOG = LogFactory.getLog(TestJsonSerde.class); + private static HBaseTestingUtility UTIL; + private static HTable historyTable; + private static JobHistoryByIdService idService; + + @BeforeClass + public static void setupBeforeClass() throws Exception { + UTIL = new HBaseTestingUtility(); + UTIL.startMiniCluster(); + HRavenTestUtil.createSchema(UTIL); + historyTable = new HTable(UTIL.getConfiguration(), Constants.HISTORY_TABLE_BYTES); + idService = new JobHistoryByIdService(UTIL.getConfiguration()); + } + + @Test + public void testJsonSerializationFlowStatsJobDetails() throws Exception { + + // load a sample flow + final short numJobsAppOne = 3 ; + final short numJobsAppTwo = 4 ; + final long baseStats = 10L ; + + GenerateFlowTestData flowDataGen = new GenerateFlowTestData(); + flowDataGen.loadFlow("c1@local", "buser", "AppOne", 1234, "a", numJobsAppOne, baseStats,idService, historyTable); + flowDataGen.loadFlow("c2@local", "Muser", "AppTwo", 2345, "b", numJobsAppTwo, baseStats,idService, historyTable); + JobHistoryService service = new JobHistoryService(UTIL.getConfiguration()); + List<Flow> actualFlows = service.getFlowTimeSeriesStats("c1@local", "buser", "AppOne", "", 0L, 0L, 1000, null); + + // serialize flows into json + ObjectMapper om = ObjectMapperProvider.createCustomMapper(); + om.configure(SerializationConfig.Feature.INDENT_OUTPUT, true); + om.configure(SerializationConfig.Feature.FAIL_ON_EMPTY_BEANS, false); + om.configure(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false); + ByteArrayOutputStream f = new ByteArrayOutputStream(); + om.writeValue(f, actualFlows); + ByteArrayInputStream is = new ByteArrayInputStream(f.toByteArray()); + @SuppressWarnings("unchecked") + List<Flow> deserFlows = (List<Flow>) JSONUtil.readJson(is, new TypeReference<List<Flow>>() {}); + assertFlowDetails(actualFlows, deserFlows); + + } + + @Test + public void
testSerializationContext() throws Exception { + + // load a sample flow + final short numJobs = 3 ; + + GenerateFlowTestData flowDataGen = new GenerateFlowTestData(); + // custom config to test out filtering of specific properties + Map<String, String> fullConfig = Maps.newHashMap(); + fullConfig.put("name", "first"); + fullConfig.put("shortprop", "brief"); + fullConfig.put("longprop", + "an extended bit of text that we will want to filter out from results"); + List<String> serializedKeys = Lists.newArrayList("name", "shortprop"); + + flowDataGen.loadFlow("c1@local", "buser", "testSerializationContext", 1234, "a", numJobs, 10, + idService, historyTable, fullConfig); + + JobHistoryService service = new JobHistoryService(UTIL.getConfiguration()); + Flow actualFlow = service.getFlow("c1@local", "buser", "testSerializationContext", + 1234, false); + assertNotNull(actualFlow); + Configuration actualConfig = actualFlow.getJobs().get(0).getConfiguration(); + assertEquals(fullConfig.get("name"), actualConfig.get("name")); + assertEquals(fullConfig.get("shortprop"), actualConfig.get("shortprop")); + assertEquals(fullConfig.get("longprop"), actualConfig.get("longprop")); + + // test serialization matching specific property keys + // serialize flow into json + RestJSONResource.serializationContext.set( + new SerializationContext(SerializationContext.DetailLevel.EVERYTHING, + new SerializationContext.ConfigurationFilter(serializedKeys))); + ObjectMapper om = ObjectMapperProvider.createCustomMapper(); + om.configure(SerializationConfig.Feature.INDENT_OUTPUT, true); + om.configure(SerializationConfig.Feature.FAIL_ON_EMPTY_BEANS, false); + om.configure(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false); + ByteArrayOutputStream f = new ByteArrayOutputStream(); + om.writeValue(f, actualFlow); + ByteArrayInputStream is = new ByteArrayInputStream(f.toByteArray()); + Flow deserFlow = (Flow) JSONUtil.readJson(is, new TypeReference<Flow>() {}); + assertFlowEquals(actualFlow, deserFlow); + // only config properties in serializedKeys should be present in the deserialized flow + Configuration deserConfig = deserFlow.getJobs().get(0).getConfiguration(); + assertEquals(fullConfig.get("name"), deserConfig.get("name")); + assertEquals(fullConfig.get("shortprop"), deserConfig.get("shortprop")); + // longprop should not have been serialized + assertNull(deserConfig.get("longprop")); + + // test serialization matching property regexes + List<String> patterns = Lists.newArrayList("^.*prop$"); + RestJSONResource.serializationContext.set( + new SerializationContext(SerializationContext.DetailLevel.EVERYTHING, + new SerializationContext.RegexConfigurationFilter(patterns))); + om = ObjectMapperProvider.createCustomMapper(); + om.configure(SerializationConfig.Feature.INDENT_OUTPUT, true); + om.configure(SerializationConfig.Feature.FAIL_ON_EMPTY_BEANS, false); + om.configure(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false); + f = new ByteArrayOutputStream(); + om.writeValue(f, actualFlow); + is = new ByteArrayInputStream(f.toByteArray()); + deserFlow = (Flow) JSONUtil.readJson(is, new TypeReference<Flow>() {}); + assertFlowEquals(actualFlow, deserFlow); + // only config properties matching the regex patterns should be present in the deserialized flow + deserConfig = deserFlow.getJobs().get(0).getConfiguration(); + // only 2 *prop keys should be present + assertNull(deserConfig.get("name")); + assertEquals(fullConfig.get("shortprop"), deserConfig.get("shortprop")); + assertEquals(fullConfig.get("longprop"), deserConfig.get("longprop")); + } + + private
void assertFlowDetails( List flow1, List flow2) { + assertNotNull(flow1); + assertNotNull(flow2); + assertEquals(flow1.size(), flow2.size()); + assertTrue(flow1.equals(flow2)); + for(int i=0; i< flow1.size();i ++) { + assertFlowEquals(flow1.get(i), flow2.get(i)); + } + } + + private void assertFlowEquals(Flow flow1, Flow flow2) { + assertEquals(flow1.getJobCount(), flow2.getJobCount()); + assertEquals(flow1.getJobs(), flow2.getJobs()); + assertEquals(flow1.getAppId(), flow2.getAppId()); + assertEquals(flow1.getCluster(), flow2.getCluster()); + assertEquals(flow1.getSubmitTime(), flow2.getSubmitTime()); + assertEquals(flow1.getDuration(), flow2.getDuration()); + assertEquals(flow1.getRunId(), flow2.getRunId()); + assertEquals(flow1.getMapSlotMillis(), flow2.getMapSlotMillis()); + assertEquals(flow1.getReduceSlotMillis(), flow2.getReduceSlotMillis()); + assertEquals(flow1.getHdfsBytesRead(), flow2.getHdfsBytesRead()); + assertEquals(flow1.getHdfsBytesWritten(), flow2.getHdfsBytesWritten()); + assertEquals(flow1.getJobGraphJSON(), flow2.getJobGraphJSON()); + assertEquals(flow1.getMapFileBytesRead(), flow2.getMapFileBytesRead()); + assertEquals(flow1.getMapFileBytesWritten(), flow2.getMapFileBytesWritten()); + assertEquals(flow1.getReduceFileBytesRead(), flow2.getReduceFileBytesRead()); + assertEquals(flow1.getTotalMaps(), flow2.getTotalMaps()); + assertEquals(flow1.getTotalReduces(), flow2.getTotalReduces()); + assertEquals(flow1.getVersion(), flow2.getVersion()); + assertEquals(flow1.getUserName(), flow2.getUserName()); + assertJobListEquals(flow1.getJobs(), flow2.getJobs()); + } + + private void assertJobListEquals( List job1, List job2) { + assertNotNull(job1); + assertNotNull(job2); + assertEquals(job1.size(), job2.size()); + + for(int j=0; j desc.getRunId()); + + // check runId setting based on set start time + Configuration c2 = new Configuration(c); + c2.set(Constants.CASCADING_RUN_CONF_KEY, Long.toString(now)); + desc = factory.create(jobId, now, c2); + assertEquals(now, desc.getRunId()); + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/TestTaskKey.java b/hraven-core/src/test/java/com/twitter/hraven/TestTaskKey.java new file mode 100644 index 0000000..3d927f4 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/TestTaskKey.java @@ -0,0 +1,82 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven; + +import static org.junit.Assert.assertEquals; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Test; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.JobKey; +import com.twitter.hraven.TaskKey; +import com.twitter.hraven.datasource.TaskKeyConverter; + +/** + * Test usage and serialization of TaskKey + */ +public class TestTaskKey { + private static Log LOG = LogFactory.getLog(TestTaskKey.class); + + @Test + public void testSerialization() { + TaskKeyConverter conv = new TaskKeyConverter(); + + TaskKey key1 = new TaskKey( + new JobKey("test@local", "testuser", "app", 1234L, "job_20120101000000_1111"), "m_001"); + assertEquals("test@local", key1.getCluster()); + assertEquals("testuser", key1.getUserName()); + assertEquals("app", key1.getAppId()); + assertEquals(1234L, key1.getRunId()); + assertEquals("job_20120101000000_1111", key1.getJobId().getJobIdString()); + assertEquals("m_001", key1.getTaskId()); + + byte[] key1Bytes = conv.toBytes(key1); + TaskKey key2 = conv.fromBytes(key1Bytes); + assertKey(key1, key2); + + TaskKey key3 = conv.fromBytes( conv.toBytes(key2) ); + assertKey(key1, key3); + + // test with a run ID containing the separator + long now = System.currentTimeMillis(); + byte[] encoded = Bytes.toBytes(Long.MAX_VALUE - now); + // replace last byte with separator and reconvert to long + Bytes.putBytes(encoded, encoded.length-Constants.SEP_BYTES.length, + Constants.SEP_BYTES, 0, Constants.SEP_BYTES.length); + long badId = Long.MAX_VALUE - Bytes.toLong(encoded); + LOG.info("Bad run ID is " + badId); + + TaskKey badKey1 = new TaskKey( + new JobKey(key1.getQualifiedJobId(), key1.getUserName(), key1.getAppId(), badId), + key1.getTaskId()); + byte[] badKeyBytes = conv.toBytes(badKey1); + TaskKey badKey2 = conv.fromBytes(badKeyBytes); + assertKey(badKey1, badKey2); + } + + private void assertKey(TaskKey expected, TaskKey actual) { + assertEquals(expected.getCluster(), actual.getCluster()); + assertEquals(expected.getUserName(), actual.getUserName()); + assertEquals(expected.getAppId(), actual.getAppId()); + assertEquals(expected.getRunId(), actual.getRunId()); + assertEquals(expected.getJobId(), actual.getJobId()); + assertEquals(expected.getTaskId(), actual.getTaskId()); + assertEquals(expected.hashCode(),actual.hashCode()); + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/datasource/HRavenTestUtil.java b/hraven-core/src/test/java/com/twitter/hraven/datasource/HRavenTestUtil.java new file mode 100644 index 0000000..5786cf5 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/datasource/HRavenTestUtil.java @@ -0,0 +1,85 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.datasource; + +import java.io.IOException; + +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.client.HTable; + +import com.twitter.hraven.Constants; + +/** + * Common utilities to support test cases. + */ +public class HRavenTestUtil { + public static void createSchema(HBaseTestingUtility util) throws IOException { + createHistoryTable(util); + createTaskTable(util); + createHistoryByJobIdTable(util); + createRawTable(util); + createProcessTable(util); + createAppVersionTable(util); + createFlowQueueTable(util); + createFlowEventTable(util); + } + + public static HTable createHistoryTable(HBaseTestingUtility util) + throws IOException { + return util.createTable(Constants.HISTORY_TABLE_BYTES, + Constants.INFO_FAM_BYTES); + } + + public static HTable createTaskTable(HBaseTestingUtility util) + throws IOException { + return util.createTable(Constants.HISTORY_TASK_TABLE_BYTES, + Constants.INFO_FAM_BYTES); + } + + public static HTable createHistoryByJobIdTable(HBaseTestingUtility util) + throws IOException { + return util.createTable(Constants.HISTORY_BY_JOBID_TABLE_BYTES, + Constants.INFO_FAM_BYTES); + } + + public static HTable createRawTable(HBaseTestingUtility util) + throws IOException { + return util.createTable(Constants.HISTORY_RAW_TABLE_BYTES, + new byte[][]{Constants.INFO_FAM_BYTES, Constants.RAW_FAM_BYTES}); + } + + public static HTable createProcessTable(HBaseTestingUtility util) + throws IOException { + return util.createTable(Constants.JOB_FILE_PROCESS_TABLE_BYTES, + Constants.INFO_FAM_BYTES); + } + + public static HTable createAppVersionTable(HBaseTestingUtility util) + throws IOException { + return util.createTable(Constants.HISTORY_APP_VERSION_TABLE_BYTES, + Constants.INFO_FAM_BYTES); + } + + public static HTable createFlowQueueTable(HBaseTestingUtility util) + throws IOException { + return util.createTable(Constants.FLOW_QUEUE_TABLE_BYTES, Constants.INFO_FAM_BYTES); + } + + public static HTable createFlowEventTable(HBaseTestingUtility util) + throws IOException { + return util.createTable(Constants.FLOW_EVENT_TABLE_BYTES, Constants.INFO_FAM_BYTES); + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/datasource/TestAppVersionService.java b/hraven-core/src/test/java/com/twitter/hraven/datasource/TestAppVersionService.java new file mode 100644 index 0000000..b7d3458 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/datasource/TestAppVersionService.java @@ -0,0 +1,194 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.datasource; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import java.util.HashSet; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import com.twitter.hraven.Constants; +import com.twitter.hraven.datasource.AppVersionService; +import com.twitter.hraven.datasource.VersionInfo; + +/** + * Test class for {@link AppVersionService} + */ +public class TestAppVersionService { + private static HBaseTestingUtility UTIL; + + private String cluster = "test@local"; + private byte[] clusterBytes = Bytes.toBytes(cluster); + private String user = "testuser"; + private byte[] userBytes = Bytes.toBytes(user); + + @BeforeClass + public static void setupBeforeClass() throws Exception { + UTIL = new HBaseTestingUtility(); + UTIL.startMiniCluster(); + HRavenTestUtil.createAppVersionTable(UTIL); + } + + @Test + public void testAddVersion() throws Exception { + Configuration c = UTIL.getConfiguration(); + AppVersionService service = new AppVersionService(c); + String appId = "addVersion"; + byte[] appIdBytes = Bytes.toBytes(appId); + + byte[] appRow = Bytes.add(Bytes.add(clusterBytes, Constants.SEP_BYTES), + Bytes.add(userBytes, Constants.SEP_BYTES), + appIdBytes); + HTable versionTable = new HTable(c, Constants.HISTORY_APP_VERSION_TABLE); + + try { + service.addVersion(cluster, user, appId, "v1", 1); + Result r = versionTable.get(new Get(appRow)); + assertNotNull(r); + // should have 1 version + assertEquals(r.list().size(), 1); + assertArrayEquals( + r.getValue(Constants.INFO_FAM_BYTES, Bytes.toBytes("v1")), + Bytes.toBytes(1L)); + + service.addVersion(cluster, user, appId, "v2", 10); + r = versionTable.get(new Get(appRow)); + assertNotNull(r); + assertEquals(r.list().size(), 2); + assertArrayEquals( + r.getValue(Constants.INFO_FAM_BYTES, Bytes.toBytes("v1")), + Bytes.toBytes(1L)); + assertArrayEquals( + r.getValue(Constants.INFO_FAM_BYTES, Bytes.toBytes("v2")), + Bytes.toBytes(10L)); + + // add v2 with earlier timestamp + service.addVersion(cluster, user, appId, "v2", 5); + r = versionTable.get(new Get(appRow)); + assertNotNull(r); + assertEquals(r.list().size(), 2); + assertArrayEquals( + r.getValue(Constants.INFO_FAM_BYTES, Bytes.toBytes("v2")), + Bytes.toBytes(5L)); + + // re-add v1 with later timestamp, should ignore + service.addVersion(cluster, user, appId, "v1", 11); + r = versionTable.get(new Get(appRow)); + assertNotNull(r); + assertEquals(r.list().size(), 2); + assertArrayEquals( + r.getValue(Constants.INFO_FAM_BYTES, Bytes.toBytes("v1")), + Bytes.toBytes(1L)); + } finally { + try { + service.close(); + } catch (Exception ignore) { + } + try { + versionTable.close(); + } catch (Exception ignore) { + } + } + } + + @Test + public void testGetLatestVersion() throws Exception { + Configuration c = UTIL.getConfiguration(); + + String appId = "getLatestVersion"; + + AppVersionService service = new AppVersionService(c); + try { + // check adding versions in order + service.addVersion(cluster, user, appId, "v1", 10); + String latest = service.getLatestVersion(cluster, user, appId); + 
assertEquals("v1", latest); + service.addVersion(cluster, user, appId, "v2", 20); + latest = service.getLatestVersion(cluster, user, appId); + assertEquals("v2", latest); + service.addVersion(cluster, user, appId, "v3", 30); + latest = service.getLatestVersion(cluster, user, appId); + assertEquals("v3", latest); + // latest should not change + service.addVersion(cluster, user, appId, "v2.5", 25); + latest = service.getLatestVersion(cluster, user, appId); + assertEquals("v3", latest); + } finally { + service.close(); + } + } + + @Test + public void testGetDistinctVersions() throws Exception { + Configuration c = UTIL.getConfiguration(); + + { /* + * TEST1 check that empty list is returned when no versions exist + */ + + String appId = "getDistinctVersions"; + AppVersionService service = new AppVersionService(c); + List<VersionInfo> latest = service.getDistinctVersions(cluster, user, appId); + // expecting nothing (0 versions) + assertEquals(latest.size(), 0); + } + + { /* + * TEST2 check that only distinct versions are returned when multiple + * versions exist + */ + String appId = "getDistinctVersions"; + AppVersionService service = new AppVersionService(c); + try { + service.addVersion(cluster, user, appId, "v1", 10); + service.addVersion(cluster, user, appId, "v2", 30); + service.addVersion(cluster, user, appId, "v1", 8390); + service.addVersion(cluster, user, appId, "v1", 90); + service.addVersion(cluster, user, appId, "v1", 80); + + List<VersionInfo> latest = service.getDistinctVersions(cluster, user, appId); + // expecting two distinct versions + assertEquals(latest.size(), 2); + HashSet<String> expVersions = new HashSet<String>(); + expVersions.add("v1"); + expVersions.add("v2"); + for (int i = 0; i < latest.size(); i++) { + assertTrue(expVersions.contains(latest.get(i).getVersion())); + } + + } finally { + service.close(); + } + } + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + UTIL.shutdownMiniCluster(); + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/datasource/TestFlowEventService.java b/hraven-core/src/test/java/com/twitter/hraven/datasource/TestFlowEventService.java new file mode 100644 index 0000000..f8a2db6 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/datasource/TestFlowEventService.java @@ -0,0 +1,107 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ +package com.twitter.hraven.datasource; + +import com.twitter.hraven.FlowEvent; +import com.twitter.hraven.FlowEventKey; +import com.twitter.hraven.FlowKey; +import com.twitter.hraven.Framework; +import com.twitter.hraven.datasource.FlowEventService; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +/** + */ +public class TestFlowEventService { + private static final String TEST_CLUSTER = "test@test"; + private static final String TEST_USER = "testuser"; + private static final String TEST_APP = "TestFlowEventService"; + + private static HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + @BeforeClass + public static void setupBeforeClass() throws Exception { + UTIL.startMiniCluster(); + HRavenTestUtil.createFlowEventTable(UTIL); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Test + public void testFlowEventReadWrite() throws Exception { + FlowEventService service = new FlowEventService(UTIL.getConfiguration()); + // setup some test data for a couple flows + long flow1Run = System.currentTimeMillis(); + FlowKey flow1Key = new FlowKey(TEST_CLUSTER, TEST_USER, TEST_APP, flow1Run); + List<FlowEvent> flow1Events = generateEvents(flow1Key, 5); + service.addEvents(flow1Events); + + long flow2Run = flow1Run + 10; + FlowKey flow2Key = new FlowKey(TEST_CLUSTER, TEST_USER, TEST_APP, flow2Run); + List<FlowEvent> flow2Events = generateEvents(flow2Key, 10); + service.addEvents(flow2Events); + + // verify we get the right events back for each + List<FlowEvent> flow1Results = service.getFlowEvents(flow1Key); + assertEvents(flow1Events, flow1Results); + List<FlowEvent> flow2Results = service.getFlowEvents(flow2Key); + assertEvents(flow2Events, flow2Results); + // check partial results + FlowEventKey flow2Last = flow2Events.get(4).getFlowEventKey(); + List<FlowEvent> flow2PartialResults = service.getFlowEventsSince(flow2Last); + assertEvents(flow2Events.subList(5, flow2Events.size()), flow2PartialResults); + } + + private List<FlowEvent> generateEvents(FlowKey flowKey, int count) { + List<FlowEvent> events = new ArrayList<FlowEvent>(count); + long now = System.currentTimeMillis(); + for (int i=1; i<=count; i++) { + FlowEvent event = new FlowEvent(new FlowEventKey(flowKey, i)); + event.setTimestamp(now+i); + event.setFramework(Framework.PIG); + event.setType("test"); + event.setEventDataJSON("event"+i); + events.add(event); + } + return events; + } + + private void assertEvents(List<FlowEvent> expected, List<FlowEvent> actual) { + assertNotNull(actual); + assertEquals(expected.size(), actual.size()); + for (int i=0; i running = service.getFlowsForStatus(TEST_CLUSTER, Flow.Status.RUNNING, 10); + assertNotNull(running); + assertEquals(2, running.size()); + + // results should be in reverse order by timestamp + Flow result1 = running.get(1); + assertFlowEquals(key1, flow1, result1); + Flow result2 = running.get(0); + assertFlowEquals(key2, flow2, result2); + + // move both flows to successful status + FlowQueueKey newKey1 = new FlowQueueKey(key1.getCluster(), Flow.Status.SUCCEEDED, + key1.getTimestamp(), key1.getFlowId()); + service.moveFlow(key1, newKey1); + FlowQueueKey newKey2 = new FlowQueueKey(key2.getCluster(), Flow.Status.SUCCEEDED, + key2.getTimestamp(), key2.getFlowId()); + service.moveFlow(key2, newKey2); + + List<Flow> succeeded = service.getFlowsForStatus(TEST_CLUSTER, Flow.Status.SUCCEEDED,
10); + assertNotNull(succeeded); + assertEquals(2, succeeded.size()); + // results should still be in reverse order by timestamp + result1 = succeeded.get(1); + assertFlowEquals(newKey1, flow1, result1); + result2 = succeeded.get(0); + assertFlowEquals(newKey2, flow2, result2); + } + + protected void assertFlowEquals(FlowQueueKey expectedKey, Flow expectedFlow, Flow resultFlow) { + assertNotNull(resultFlow.getQueueKey()); + LOG.info("Expected queue key is " + expectedKey); + LOG.info("Result queue key is "+resultFlow.getQueueKey()); + assertTrue(expectedKey.equals(resultFlow.getQueueKey())); + assertEquals(expectedFlow.getJobGraphJSON(), resultFlow.getJobGraphJSON()); + assertEquals(expectedFlow.getFlowName(), resultFlow.getFlowName()); + assertEquals(expectedFlow.getUserName(), resultFlow.getUserName()); + assertEquals(expectedFlow.getProgress(), resultFlow.getProgress()); + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/datasource/TestJobHistoryRawService.java b/hraven-core/src/test/java/com/twitter/hraven/datasource/TestJobHistoryRawService.java new file mode 100644 index 0000000..9b09093 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/datasource/TestJobHistoryRawService.java @@ -0,0 +1,240 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import static org.junit.Assert.assertEquals; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Test; + +import com.twitter.hraven.JobId; +import com.twitter.hraven.Range; +import com.twitter.hraven.datasource.JobHistoryRawService; +import com.twitter.hraven.util.BatchUtil; + +public class TestJobHistoryRawService { + + private static final String JOB_HISTORY_FILE_NAME = "src/test/resources/done/something.example.com_1337787092259_job_201205231531_256984_userName1_App1"; + + /** + * Normal example + */ + private static final String JOB_HISTORY = "Meta VERSION=\"1\" ." + + "Job JOBID=\"job_201206061540_11222\"" + + "JOBNAME=\"App1:some_project_one_day\"" + + "USER=\"someone\" SUBMIT_TIME=\"1339063492288\"" + "JOBCONF=\""; + + /** + * Submit time at the end of the string, but still includes quote + */ + private static final String JOB_HISTORY2 = "Meta VERSION=\"1\" ." + + "Job JOBID=\"job_201206061540_11222\"" + + "JOBNAME=\"App1:some_project_one_day\"" + + "USER=\"someone\" SUBMIT_TIME=\"1339063492288\""; + + /** + * Submit time is the only thing in the string. + */ + private static final String JOB_HISTORY3 = "SUBMIT_TIME=\"1339063492288\""; + + private static final String BAD_JOB_HISTORY = "SUBMIT_TIME=\""; + + private static final String BAD_JOB_HISTORY2 = "SUBMIT_TIME=\"\""; + + /** + * Missing quote at the end + */ + private static final String BAD_JOB_HISTORY3 = "Meta VERSION=\"1\" ." 
+ + "Job JOBID=\"job_201206061540_11222\"" + + "JOBNAME=\"App1:some_project_one_day\"" + + "USER=\"someone2\" SUBMIT_TIME=\"1339063492288"; + + /** + * Missing start quote + */ + private static final String BAD_JOB_HISTORY4 = "Meta VERSION=\"1\" ." + + "Job JOBID=\"job_201206061540_11222\"" + + "JOBNAME=\"App1:some_project_one_day\"" + + "USER=\"someone3\" SUBMIT_TIME=1339063492288\""; + + /** + * Confirm that we can properly find the submit timestamp. + */ + @Test + public void testGetSubmitTimeMillisFromJobHistory() { + byte[] jobHistoryBytes = Bytes.toBytes(JOB_HISTORY); + long submitTimeMillis = JobHistoryRawService + .getSubmitTimeMillisFromJobHistory(jobHistoryBytes); + assertEquals(1339063492288L, submitTimeMillis); + + jobHistoryBytes = Bytes.toBytes(JOB_HISTORY2); + submitTimeMillis = JobHistoryRawService + .getSubmitTimeMillisFromJobHistory(jobHistoryBytes); + assertEquals(1339063492288L, submitTimeMillis); + + jobHistoryBytes = Bytes.toBytes(JOB_HISTORY3); + submitTimeMillis = JobHistoryRawService + .getSubmitTimeMillisFromJobHistory(jobHistoryBytes); + assertEquals(1339063492288L, submitTimeMillis); + + // Now some cases where we should not be able to find any timestamp. + jobHistoryBytes = Bytes.toBytes(""); + submitTimeMillis = JobHistoryRawService + .getSubmitTimeMillisFromJobHistory(jobHistoryBytes); + assertEquals(0L, submitTimeMillis); + + jobHistoryBytes = Bytes.toBytes(BAD_JOB_HISTORY); + submitTimeMillis = JobHistoryRawService + .getSubmitTimeMillisFromJobHistory(jobHistoryBytes); + assertEquals(0L, submitTimeMillis); + + jobHistoryBytes = Bytes.toBytes(BAD_JOB_HISTORY2); + submitTimeMillis = JobHistoryRawService + .getSubmitTimeMillisFromJobHistory(jobHistoryBytes); + assertEquals(0L, submitTimeMillis); + + jobHistoryBytes = Bytes.toBytes(BAD_JOB_HISTORY3); + submitTimeMillis = JobHistoryRawService + .getSubmitTimeMillisFromJobHistory(jobHistoryBytes); + assertEquals(0L, submitTimeMillis); + + jobHistoryBytes = Bytes.toBytes(BAD_JOB_HISTORY4); + submitTimeMillis = JobHistoryRawService + .getSubmitTimeMillisFromJobHistory(jobHistoryBytes); + assertEquals(0L, submitTimeMillis); + } + + /** + * Confirm that we can properly find the submit timestamp. + * + * @throws IOException + */ + @Test + public void testGetSubmitTimeMillisFromJobHistoryFile() throws IOException { + byte[] jobHistoryBytes = null; + + File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + + FileInputStream fis = new FileInputStream(jobHistoryfile); + try { + byte[] buffer = new byte[1024]; + int length = 0; + while ((length = fis.read(buffer)) > 0) { + bos.write(buffer, 0, length); + } + jobHistoryBytes = bos.toByteArray(); + } finally { + fis.close(); + } + + long submitTimeMillis = JobHistoryRawService + .getSubmitTimeMillisFromJobHistory(jobHistoryBytes); + assertEquals(1338958320124L, submitTimeMillis); + } + + /** + * Does not test a specific method, but tests the algorithm used to get + * ranges. 
+ */ + @Test + public void testGetJobIdRanges() { + + long aEpoch = 123456; + long bEpoch = 234567; + + JobId aOne = new JobId(aEpoch, 1); + JobId aTwo = new JobId(aEpoch, 2); + JobId aThree = new JobId(aEpoch, 3); + JobId aSeven = new JobId(aEpoch, 7); + + JobId aThirteen = new JobId(aEpoch, 13); + JobId aHundredOne = new JobId(aEpoch, 101); + JobId bOne = new JobId(bEpoch, 1); + JobId bTwo = new JobId(bEpoch, 2); + + JobId bThree = new JobId(bEpoch, 3); + JobId bSeven = new JobId(bEpoch, 7); + JobId bThirteen = new JobId(bEpoch, 13); + JobId bHundredOne = new JobId(bEpoch, 101); + + SortedSet<JobId> orderedJobIds = new TreeSet<JobId>(); + // Add in scrambled order + orderedJobIds.add(bSeven); + orderedJobIds.add(aSeven); + orderedJobIds.add(aThree); + orderedJobIds.add(bThree); + orderedJobIds.add(bThirteen); + orderedJobIds.add(aThirteen); + orderedJobIds.add(aOne); + orderedJobIds.add(bOne); + orderedJobIds.add(aHundredOne); + orderedJobIds.add(bHundredOne); + orderedJobIds.add(aTwo); + orderedJobIds.add(bTwo); + + // And for good measure add these again, set should take them out. + orderedJobIds.add(aTwo); + orderedJobIds.add(bTwo); + + assertEquals(12, orderedJobIds.size()); + + List<Range<JobId>> ranges = BatchUtil.getRanges(orderedJobIds, 4); + assertEquals(3, ranges.size()); + assertEquals(1, ranges.get(0).getMin().getJobSequence()); + assertEquals(7, ranges.get(0).getMax().getJobSequence()); + assertEquals(aEpoch, ranges.get(0).getMin().getJobEpoch()); + assertEquals(aEpoch, ranges.get(0).getMax().getJobEpoch()); + + assertEquals(13, ranges.get(1).getMin().getJobSequence()); + assertEquals(2, ranges.get(1).getMax().getJobSequence()); + assertEquals(aEpoch, ranges.get(1).getMin().getJobEpoch()); + assertEquals(bEpoch, ranges.get(1).getMax().getJobEpoch()); + + assertEquals(3, ranges.get(2).getMin().getJobSequence()); + assertEquals(101, ranges.get(2).getMax().getJobSequence()); + assertEquals(bEpoch, ranges.get(2).getMin().getJobEpoch()); + assertEquals(bEpoch, ranges.get(2).getMax().getJobEpoch()); + + long cEpoch = 345678; + long triangular = 1000405; + JobId cTriangular = new JobId(cEpoch, triangular); + orderedJobIds.add(cTriangular); + + assertEquals(13, orderedJobIds.size()); + ranges = BatchUtil.getRanges(orderedJobIds, 4); + assertEquals(4, ranges.size()); + assertEquals(triangular, ranges.get(3).getMin().getJobSequence()); + assertEquals(triangular, ranges.get(3).getMax().getJobSequence()); + assertEquals(cEpoch, ranges.get(3).getMin().getJobEpoch()); + assertEquals(cEpoch, ranges.get(3).getMax().getJobEpoch()); + + ranges = BatchUtil.getRanges(orderedJobIds, 1000); + assertEquals(1, ranges.size()); + assertEquals(1, ranges.get(0).getMin().getJobSequence()); + assertEquals(triangular, ranges.get(0).getMax().getJobSequence()); + assertEquals(aEpoch, ranges.get(0).getMin().getJobEpoch()); + assertEquals(cEpoch, ranges.get(0).getMax().getJobEpoch()); + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/datasource/TestJobHistoryService.java b/hraven-core/src/test/java/com/twitter/hraven/datasource/TestJobHistoryService.java new file mode 100644 index 0000000..d56c30a --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/datasource/TestJobHistoryService.java @@ -0,0 +1,302 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.datasource; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.fail; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.client.HTable; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.Flow; +import com.twitter.hraven.GenerateFlowTestData; +import com.twitter.hraven.JobDetails; +import com.twitter.hraven.JobKey; +import com.twitter.hraven.datasource.JobHistoryByIdService; +import com.twitter.hraven.datasource.JobHistoryService; +import com.twitter.hraven.datasource.HRavenTestUtil; + +/** + * Round-trip testing for storage and retrieval of data in job_history table. + * + */ +public class TestJobHistoryService { + private static Log LOG = LogFactory.getLog(TestJobHistoryService.class); + private static HBaseTestingUtility UTIL; + private static HTable historyTable; + private static JobHistoryByIdService idService; + private static GenerateFlowTestData flowDataGen ; + + @BeforeClass + public static void setupBeforeClass() throws Exception { + UTIL = new HBaseTestingUtility(); + UTIL.startMiniCluster(); + HRavenTestUtil.createSchema(UTIL); + historyTable = new HTable(UTIL.getConfiguration(), Constants.HISTORY_TABLE_BYTES); + idService = new JobHistoryByIdService(UTIL.getConfiguration()); + flowDataGen = new GenerateFlowTestData(); + + } + + @Test + public void testJobHistoryRead() throws Exception { + // load some initial data + // a few runs of the same app + + flowDataGen.loadFlow("c1@local", "buser", "app1", 1234, "a", 3, 10,idService, historyTable); + flowDataGen.loadFlow("c1@local", "buser", "app1", 1345, "a", 3, 10,idService, historyTable); + flowDataGen.loadFlow("c1@local", "buser", "app1", 1456, "a", 3, 10,idService, historyTable); + + flowDataGen.loadFlow("c1@local", "buser", "app2", 1212, "a", 1, 10,idService, historyTable); + + flowDataGen.loadFlow("c1@local", "fuser", "app1", 2345, "a", 2, 10,idService, historyTable); + flowDataGen.loadFlow("c1@local", "fuser", "app1", 2456, "b", 2, 10,idService, historyTable); + + // read out job history flow directly + JobHistoryService service = new JobHistoryService(UTIL.getConfiguration()); + try { + Flow flow = service.getLatestFlow("c1@local", "buser", "app1"); + assertNotNull(flow); + assertEquals(3, flow.getJobs().size()); + for (JobDetails j : flow.getJobs()) { + JobKey k = j.getJobKey(); + assertEquals("c1@local", k.getCluster()); + assertEquals("buser", k.getUserName()); + assertEquals("app1", k.getAppId()); + assertEquals(1456L, k.getRunId()); + assertEquals("a", j.getVersion()); + } + + List flowSeries = service.getFlowSeries("c1@local", "buser", "app1", 100); + assertNotNull(flowSeries); + assertEquals(3, flowSeries.size()); + for (Flow f : flowSeries) { + for (JobDetails j : f.getJobs()) { + JobKey k = j.getJobKey(); + assertEquals(f.getCluster(), k.getCluster()); + 
assertEquals(f.getUserName(), k.getUserName()); + assertEquals(f.getAppId(), k.getAppId()); + assertEquals(f.getRunId(), k.getRunId()); + } + } + + flowSeries = service.getFlowSeries("c1@local", "buser", "app2", 100); + assertNotNull(flowSeries); + assertEquals(1, flowSeries.size()); + Flow first = flowSeries.get(0); + assertEquals(1, first.getJobs().size()); + JobDetails firstJob = first.getJobs().get(0); + assertEquals("c1@local", firstJob.getJobKey().getCluster()); + assertEquals("buser", firstJob.getJobKey().getUserName()); + assertEquals("app2", firstJob.getJobKey().getAppId()); + assertEquals(1212L, firstJob.getJobKey().getRunId()); + + flowSeries = service.getFlowSeries("c1@local", "fuser", "app1", 100); + assertNotNull(flowSeries); + assertEquals(2, flowSeries.size()); + Flow f1 = flowSeries.get(0); + assertEquals(2, f1.getJobs().size()); + assertEquals("fuser", f1.getUserName()); + assertEquals("app1", f1.getAppId()); + for (JobDetails j : f1.getJobs()) { + assertEquals(2456L, j.getJobKey().getRunId()); + assertEquals("b", j.getVersion()); + } + Flow f2 = flowSeries.get(1); + assertEquals(2, f2.getJobs().size()); + assertEquals("fuser", f2.getUserName()); + assertEquals("app1", f2.getAppId()); + for (JobDetails j : f2.getJobs()) { + assertEquals(2345L, j.getJobKey().getRunId()); + assertEquals("a", j.getVersion()); + } + + // test reading job history flow by job ID + String jobId = f2.getJobs().get(0).getJobId(); + Flow f2FromId = service.getFlowByJobID("c1@local", jobId, false); + assertNotNull(f2FromId); + assertEquals(f2.getCluster(), f2FromId.getCluster()); + assertEquals(f2.getUserName(), f2FromId.getUserName()); + assertEquals(f2.getAppId(), f2FromId.getAppId()); + assertEquals(f2.getRunId(), f2FromId.getRunId()); + assertEquals(f2.getJobs().size(), f2FromId.getJobs().size()); + for (int i=0; i versionSeries = service.getFlowSeries("c1@local", "fuser", "app1", "a", false, 100); + assertNotNull(versionSeries); + assertEquals(1, versionSeries.size()); + for (JobDetails j : versionSeries.get(0).getJobs()) { + assertEquals(2345L, j.getJobKey().getRunId()); + assertEquals("a", j.getVersion()); + } + } finally { + service.close(); + } + } + + @Test + public void testGetJobByJobID() throws Exception { + // load a sample flow + flowDataGen.loadFlow("c1@local", "buser", "getJobByJobID", 1234, "a", 3, 10, + idService, historyTable); + + JobHistoryService service = new JobHistoryService(UTIL.getConfiguration()); + try { + // fetch back the entire flow + Flow flow = service.getLatestFlow("c1@local", "buser", "getJobByJobID"); + assertNotNull(flow); + assertEquals(3, flow.getJobs().size()); + // for each job in the flow, validate that we can retrieve it individually + for (JobDetails j : flow.getJobs()) { + JobKey key = j.getJobKey(); + JobDetails j2 = service.getJobByJobID(key.getQualifiedJobId(), false); + assertJob(j, j2); + } + } finally { + service.close(); + } + } + + @Test + public void testGetFlowTimeSeriesStats() throws Exception { + + // load a sample flow + final short numJobsAppOne = 3 ; + final short numJobsAppTwo = 4 ; + final long baseStats = 10L ; + + flowDataGen.loadFlow("c1@local", "buser", "AppOne", 1234, "a", numJobsAppOne, baseStats, + idService, historyTable); + flowDataGen.loadFlow("c1@local", "buser", "AppTwo", 2345, "b", numJobsAppTwo, baseStats, + idService, historyTable); + + JobHistoryService service = new JobHistoryService(UTIL.getConfiguration()); + try { + // fetch back the entire flow stats + List flowSeries = service.getFlowTimeSeriesStats("c1@local", 
"buser", "AppOne", "", 0L, 0L, 1000, null); + assertNotNull(flowSeries); + for ( Flow f : flowSeries ){ + assertEquals( numJobsAppOne, f.getJobCount()); + assertEquals( numJobsAppOne * baseStats , f.getTotalMaps()); + assertEquals( numJobsAppOne * baseStats , f.getTotalReduces()); + assertEquals( numJobsAppOne * baseStats , f.getHdfsBytesRead()); + assertEquals( numJobsAppOne * baseStats , f.getHdfsBytesWritten()); + assertEquals( numJobsAppOne * baseStats , f.getMapFileBytesRead()); + assertEquals( numJobsAppOne * baseStats , f.getMapFileBytesWritten()); + assertEquals( numJobsAppOne * baseStats , f.getMapSlotMillis()); + assertEquals( numJobsAppOne * baseStats , f.getReduceFileBytesRead()); + assertEquals( numJobsAppOne * baseStats , f.getReduceShuffleBytes()); + assertEquals( numJobsAppOne * baseStats , f.getReduceSlotMillis()); + assertEquals( "a" , f.getVersion()); + assertEquals( numJobsAppOne * 1000, f.getDuration()); + // verify that job configurations are empty + for (JobDetails job : f.getJobs()) { + assertEquals(0, job.getConfiguration().size()); + } + } + + flowSeries = service.getFlowTimeSeriesStats("c1@local", "buser", "AppTwo", "", 0L, 0L, 1000, null); + assertNotNull(flowSeries); + for ( Flow f : flowSeries ){ + assertEquals( numJobsAppTwo, f.getJobCount()); + assertEquals( numJobsAppTwo * baseStats , f.getTotalMaps()); + assertEquals( numJobsAppTwo * baseStats , f.getTotalReduces()); + assertEquals( numJobsAppTwo * baseStats , f.getHdfsBytesRead()); + assertEquals( numJobsAppTwo * baseStats , f.getHdfsBytesWritten()); + assertEquals( numJobsAppTwo * baseStats , f.getMapFileBytesRead()); + assertEquals( numJobsAppTwo * baseStats , f.getMapFileBytesWritten()); + assertEquals( numJobsAppTwo * baseStats , f.getMapSlotMillis()); + assertEquals( numJobsAppTwo * baseStats , f.getReduceFileBytesRead()); + assertEquals( numJobsAppTwo * baseStats , f.getReduceShuffleBytes()); + assertEquals( numJobsAppTwo * baseStats , f.getReduceSlotMillis()); + assertEquals( "b" , f.getVersion()); + assertEquals( numJobsAppTwo * 1000, f.getDuration()); + } + } finally { + service.close(); + } + } + + @Test + public void testRemoveJob() throws Exception { + // load a sample flow + flowDataGen.loadFlow("c1@local", "ruser", "removeJob", 1234, "a", 3, 10,idService, historyTable); + + JobHistoryService service = new JobHistoryService(UTIL.getConfiguration()); + try { + // fetch back the entire flow + Flow flow = service.getLatestFlow("c1@local", "ruser", "removeJob"); + assertNotNull(flow); + assertEquals(3, flow.getJobs().size()); + + // remove the first job + List origJobs = flow.getJobs(); + JobDetails toRemove = origJobs.get(0); + // drop the the collection so we can compare remaining + origJobs.remove(0); + LOG.info("Removing job "+toRemove.getJobKey()); + service.removeJob(toRemove.getJobKey()); + + Flow flow2 = service.getLatestFlow("c1@local", "ruser", "removeJob"); + assertNotNull(flow2); + assertEquals(2, flow2.getJobs().size()); + for (JobDetails j : flow2.getJobs()) { + if (j.getJobKey().equals(toRemove.getJobKey())) { + fail("Removed job ("+toRemove.getJobKey()+") is still present in flow!"); + } + } + + // remaining jobs in the flow should match + List flow2Jobs = flow2.getJobs(); + assertEquals(origJobs.size(), flow2Jobs.size()); + for (int i=0; i < origJobs.size(); i++) { + JobDetails j1 = origJobs.get(i); + JobDetails j2 = flow2Jobs.get(i); + assertJob(j1, j2); + } + // TODO: validate deletion of task rows + } finally { + service.close(); + } + } + + private void assertJob(JobDetails 
expected, JobDetails actual) { + assertNotNull(actual); + assertEquals(expected.getJobKey(), actual.getJobKey()); + assertEquals(expected.getJobId(), actual.getJobId()); + assertEquals(expected.getStatus(), actual.getStatus()); + assertEquals(expected.getVersion(), actual.getVersion()); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + UTIL.shutdownMiniCluster(); + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/rest/TestPaginatedResult.java b/hraven-core/src/test/java/com/twitter/hraven/rest/TestPaginatedResult.java new file mode 100644 index 0000000..c9fc68b --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/rest/TestPaginatedResult.java @@ -0,0 +1,76 @@ +package com.twitter.hraven.rest; +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertNotNull; +import static junit.framework.Assert.assertNull; +import java.util.ArrayList; +import java.util.List; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Test; + +/** + * Unit tests for the PaginatedResult class + */ + +public class TestPaginatedResult { + + private final int INTEGER_PAGE_LIMIT = 10; + + @Test + public void checkOnePageResults() { + PaginatedResult pageOfInts = new PaginatedResult(INTEGER_PAGE_LIMIT); + assertNotNull(pageOfInts); + assertEquals(pageOfInts.getLimit(), INTEGER_PAGE_LIMIT); + List actualValues = new ArrayList(); + populateListOfInts(actualValues, INTEGER_PAGE_LIMIT); + pageOfInts.setValues(actualValues); + List expectedValues = new ArrayList(); + populateListOfInts(expectedValues, INTEGER_PAGE_LIMIT); + assertEquals(actualValues.size(), pageOfInts.getLimit()); + assertEquals(expectedValues.size(), pageOfInts.getLimit()); + assertNull(pageOfInts.getNextStartRow()); + assertEquals(expectedValues, pageOfInts.getValues()); + } + + @Test + public void checkMultiplePageResults() { + final int EXTRA_RESULTS = 1; + final Integer NEXT_START_ROW = (INTEGER_PAGE_LIMIT + 1) * INTEGER_PAGE_LIMIT; + PaginatedResult pageOfInts = new PaginatedResult(INTEGER_PAGE_LIMIT); + assertNotNull(pageOfInts); + assertEquals(pageOfInts.getLimit(), INTEGER_PAGE_LIMIT); + List actualValues = new ArrayList(); + populateListOfInts(actualValues, INTEGER_PAGE_LIMIT + EXTRA_RESULTS); + pageOfInts.setValues(actualValues.subList(0, INTEGER_PAGE_LIMIT)); + List expectedValues = new ArrayList(); + populateListOfInts(expectedValues, INTEGER_PAGE_LIMIT); + pageOfInts.setNextStartRow(Bytes.toBytes(actualValues.get( INTEGER_PAGE_LIMIT))); + assertEquals(actualValues.size(), pageOfInts.getLimit() + EXTRA_RESULTS); + assertEquals(expectedValues.size(), pageOfInts.getLimit()); + assertNotNull(pageOfInts.getNextStartRow()); + assertEquals(NEXT_START_ROW.intValue(), Bytes.toInt(pageOfInts.getNextStartRow())); + assertEquals(expectedValues, pageOfInts.getValues()); + } + + @Test + public void checkLessThanOnePageResults() { + final int LESS_THAN_ONE_PAGE = INTEGER_PAGE_LIMIT / 2; + PaginatedResult pageOfInts = new PaginatedResult(INTEGER_PAGE_LIMIT); + assertNotNull(pageOfInts); + assertEquals(pageOfInts.getLimit(), INTEGER_PAGE_LIMIT); + List actualValues = new ArrayList(); + populateListOfInts(actualValues, LESS_THAN_ONE_PAGE); + pageOfInts.setValues(actualValues); + List expectedValues = new ArrayList(); + populateListOfInts(expectedValues, LESS_THAN_ONE_PAGE); + assertEquals(LESS_THAN_ONE_PAGE, pageOfInts.getValues().size()); + assertNull(pageOfInts.getNextStartRow()); + assertEquals(expectedValues, pageOfInts.getValues()); + + } + + private 
void populateListOfInts(List inputValues, int limit) { + for (int i = 1; i <= limit; i++) { + inputValues.add(i * INTEGER_PAGE_LIMIT); + } + } +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/util/TestBatchUtil.java b/hraven-core/src/test/java/com/twitter/hraven/util/TestBatchUtil.java new file mode 100644 index 0000000..293523e --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/util/TestBatchUtil.java @@ -0,0 +1,114 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.util; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; +import java.util.List; + +import org.junit.Test; + +import com.twitter.hraven.Range; +import com.twitter.hraven.util.BatchUtil; + +/** + *Test for the JobFilePartitioner class. + * + */ +public class TestBatchUtil { + + /** + * Test {@link BatchUtil#shouldRetain(int, int, int)} method. + */ + @Test + public void testShouldRetain() { + assertTrue(BatchUtil.shouldRetain(0, 1, 1)); + assertTrue(BatchUtil.shouldRetain(5, 5, 10)); + + assertFalse(BatchUtil.shouldRetain(0, 1, 2)); + assertFalse(BatchUtil.shouldRetain(4, 5, 10)); + assertFalse(BatchUtil.shouldRetain(4, 100000, 155690)); + } + + /** + * Test {@link BatchUtil#getBatchCount(int, int)} method. + */ + @Test + public void testGetBatchCount() { + // Edge cases + assertEquals(0, BatchUtil.getBatchCount(0,0)); + assertEquals(0, BatchUtil.getBatchCount(-1,0)); + assertEquals(0, BatchUtil.getBatchCount(-2,4)); + assertEquals(0, BatchUtil.getBatchCount(5,-7)); + + + // One + assertEquals(1, BatchUtil.getBatchCount(9,9)); + assertEquals(1, BatchUtil.getBatchCount(9,10)); + assertEquals(1, BatchUtil.getBatchCount(9,11)); + assertEquals(1, BatchUtil.getBatchCount(9,18)); + assertEquals(1, BatchUtil.getBatchCount(9,19)); + + // More + assertEquals(2, BatchUtil.getBatchCount(9,8)); + assertEquals(2, BatchUtil.getBatchCount(9,7)); + assertEquals(2, BatchUtil.getBatchCount(9,6)); + assertEquals(2, BatchUtil.getBatchCount(9,5)); + assertEquals(3, BatchUtil.getBatchCount(9,4)); + assertEquals(3, BatchUtil.getBatchCount(9,3)); + assertEquals(5, BatchUtil.getBatchCount(9,2)); + assertEquals(9, BatchUtil.getBatchCount(9,1)); + } + + /** + * Confirm that getting ranges works correctly. 
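+   * For example, batching the list [1,2,3,4,5,6,7,8,9,10] with a batch size of 3
+   * is expected to yield four ranges: [1-3], [4-6], [7-9] and [10-10], while a
+   * batch size larger than the list (such as 17) should collapse everything into
+   * the single range [1-10].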
+ */ + @Test + public void testGetRanges() { + + List list = Arrays.asList(1,2,3); + List> rangeList = BatchUtil.getRanges(list, 1); + assertEquals(3, rangeList.size()); + assertEquals(Integer.valueOf(1), rangeList.get(0).getMin()); + assertEquals(Integer.valueOf(1), rangeList.get(0).getMax()); + assertEquals(Integer.valueOf(2), rangeList.get(1).getMin()); + assertEquals(Integer.valueOf(2), rangeList.get(1).getMax()); + assertEquals(Integer.valueOf(3), rangeList.get(2).getMin()); + assertEquals(Integer.valueOf(3), rangeList.get(2).getMax()); + + + list = Arrays.asList(1,2,3,4,5,6,7,8,9,10); + rangeList = BatchUtil.getRanges(list, 3); + assertEquals(4, rangeList.size()); + assertEquals(Integer.valueOf(1), rangeList.get(0).getMin()); + assertEquals(Integer.valueOf(3), rangeList.get(0).getMax()); + assertEquals(Integer.valueOf(4), rangeList.get(1).getMin()); + assertEquals(Integer.valueOf(6), rangeList.get(1).getMax()); + assertEquals(Integer.valueOf(7), rangeList.get(2).getMin()); + assertEquals(Integer.valueOf(9), rangeList.get(2).getMax()); + assertEquals(Integer.valueOf(10), rangeList.get(3).getMin()); + assertEquals(Integer.valueOf(10), rangeList.get(3).getMax()); + + rangeList = BatchUtil.getRanges(list, 17); + assertEquals(1, rangeList.size()); + assertEquals(Integer.valueOf(1), rangeList.get(0).getMin()); + assertEquals(Integer.valueOf(10), rangeList.get(0).getMax()); + } + +} diff --git a/hraven-core/src/test/java/com/twitter/hraven/util/TestByteUtil.java b/hraven-core/src/test/java/com/twitter/hraven/util/TestByteUtil.java new file mode 100644 index 0000000..7d20a34 --- /dev/null +++ b/hraven-core/src/test/java/com/twitter/hraven/util/TestByteUtil.java @@ -0,0 +1,267 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.util; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.fail; + +import java.util.List; + +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Test; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.util.ByteUtil; + +/** + */ +public class TestByteUtil { + private byte[] source1 = Bytes.toBytes("abc!bcd!cde"); + byte[] sep1 = Bytes.toBytes("!"); + + byte[] source2 = Bytes.toBytes("random::stuff::"); + byte[] sep2 = Bytes.toBytes("::"); + + byte[] source3 = Bytes.toBytes("::more::stuff::"); + byte[] sep3 = Bytes.toBytes("::"); + + byte[] source4 = Bytes.toBytes("singlesource"); + byte[] sep4 = Bytes.toBytes("::"); + + byte[] source5 = Bytes.toBytes("abc!!bcd"); + byte[] sep5 = Bytes.toBytes("!"); + + byte[] source6 = Bytes.toBytes("single:source"); + byte[] sep6 = Bytes.toBytes("::"); + + @Test + public void testSplit() { + // single byte separator + byte[][] expected1 = Bytes + .toByteArrays(new String[] { "abc", "bcd", "cde" }); + + byte[][] splitresult = ByteUtil.split(source1, sep1); + assertSplitResult(expected1, splitresult); + + // multi-byte separator, plus trailing separator + byte[][] expected2 = Bytes.toByteArrays(new String[] { "random", "stuff", + "" }); + splitresult = ByteUtil.split(source2, sep2); + assertSplitResult(expected2, splitresult); + + // leading and trailing separator + byte[][] expected3 = Bytes.toByteArrays(new String[] { "", "more", "stuff", + "" }); + splitresult = ByteUtil.split(source3, sep3); + assertSplitResult(expected3, splitresult); + + // source with no separator + byte[][] expected4 = Bytes.toByteArrays(new String[] { "singlesource" }); + splitresult = ByteUtil.split(source4, sep4); + assertSplitResult(expected4, splitresult); + + // source with empty component + byte[][] expected5 = + Bytes.toByteArrays(new String[]{ "abc", "", "bcd"}); + splitresult = ByteUtil.split(source5, sep5); + assertSplitResult(expected5, splitresult); + + byte[][] expected6 = new byte[][] {source6}; + splitresult = ByteUtil.split(source6, sep6); + assertSplitResult(expected6, splitresult); + } + + @Test + public void testSplitWithLimit() { + // source with more splits than the limit + byte[][] expectedResult = Bytes.toByteArrays(new String[] {"random", "stuff::"}); + byte[][] splitResult = ByteUtil.split(source2, sep2, 2); + assertSplitResult(expectedResult, splitResult); + + // source with fewer splits than the limit + expectedResult = Bytes.toByteArrays(new String[] {"random", "stuff", ""}); + splitResult = ByteUtil.split(source2, sep2, 100); + assertSplitResult(expectedResult, splitResult); + + // source with limit of 1 + expectedResult = new byte[][] {source2}; + splitResult = ByteUtil.split(source2, sep2, 1); + assertSplitResult(expectedResult, splitResult); + } + + private void assertSplitResult(byte[][] expectedResult, byte[][] actualResult) { + assertEquals(expectedResult.length, actualResult.length); + for (int i = 0; i < actualResult.length; i++) { + assertArrayEquals("Result " + i + " should match", expectedResult[i], + actualResult[i]); + } + } + + @Test + public void testSplitRanges() { + // test basic range sanity checking + try { + new ByteUtil.Range(-1, 1); + fail("Should have failed with start < 0"); + } catch (IllegalArgumentException expected) { + } + try { + new ByteUtil.Range(2, 1); + fail("Should have failed with end < start"); + } catch (IllegalArgumentException 
expected) { + } + + List ranges1 = ByteUtil.splitRanges(source1, sep1); + assertEquals("source1 should have 3 segments", 3, ranges1.size()); + assertEquals(0, ranges1.get(0).start()); + assertEquals(3, ranges1.get(0).length()); + assertEquals(4, ranges1.get(1).start()); + assertEquals(3, ranges1.get(1).length()); + assertEquals(8, ranges1.get(2).start()); + assertEquals(3, ranges1.get(2).length()); + + List ranges4 = ByteUtil.splitRanges(source4, sep4); + assertEquals("source4 should be a single segment", 1, ranges4.size()); + assertEquals(0, ranges4.get(0).start()); + assertEquals(source4.length, ranges4.get(0).length()); + + List ranges5 = ByteUtil.splitRanges(source5, sep5); + assertEquals(3, ranges5.size()); + assertEquals(0, ranges5.get(0).start()); + assertEquals(3, ranges5.get(0).end()); + assertEquals(4, ranges5.get(1).start()); + assertEquals(4, ranges5.get(1).end()); + assertEquals(5, ranges5.get(2).start()); + assertEquals(8, ranges5.get(2).end()); + } + + @Test + public void testJoin() { + byte[] comp1 = Bytes.toBytes("abc"); + byte[] comp2 = Bytes.toBytes("def"); + byte[] comp3 = Bytes.toBytes("ghi"); + + // test empty case + byte[] joined = ByteUtil.join(Constants.SEP_BYTES); + assertNotNull(joined); + assertEquals(0, joined.length); + + // test no separator + joined = ByteUtil.join(null, comp1, comp2, comp3); + assertNotNull(joined); + assertArrayEquals(Bytes.toBytes("abcdefghi"), joined); + + // test normal case + joined = ByteUtil.join(Constants.SEP_BYTES, comp1, comp2, comp3); + assertNotNull(joined); + assertArrayEquals( + Bytes.toBytes("abc"+Constants.SEP+"def"+Constants.SEP+"ghi"), joined); + } + + /** + * + */ + @Test + public void testIndexOf() { + + byte[] array = Bytes.toBytes("quackattack"); + byte[] a = Bytes.toBytes("a"); + byte[] ack = Bytes.toBytes("ack"); + byte[] empty = Bytes.toBytes(""); + + int index = ByteUtil.indexOf(array, null, 0); + assertEquals(-1, index); + + index = ByteUtil.indexOf(null, ack, 0); + assertEquals(-1, index); + + index = ByteUtil.indexOf(null, ack, 1); + assertEquals(-1, index); + + index = ByteUtil.indexOf(array, ack, 100); + assertEquals(-1, index); + + index = ByteUtil.indexOf(array, ack, 100); + assertEquals(-1, index); + + index = ByteUtil.indexOf(array, a, array.length + 1); + assertEquals(-1, index); + + index = ByteUtil.indexOf(array, ack, array.length + 1); + assertEquals(-1, index); + + index = ByteUtil.indexOf(array, empty, array.length + 1); + assertEquals(-1, index); + + index = ByteUtil.indexOf(array, empty, 100); + assertEquals(-1, index); + + index = ByteUtil.indexOf(array, ack, -3); + assertEquals(-1, index); + + index = ByteUtil.indexOf(array, empty, -3); + assertEquals(-1, index); + + // Empty array should be at the startIndex + index = ByteUtil.indexOf(array, empty, 0); + assertEquals(0, index); + + index = ByteUtil.indexOf(array, empty, 4); + assertEquals(4, index); + + // Normal cases + assertIndexOf(0, array, empty, 0); + assertIndexOf(1, array, empty, 1); + assertIndexOf(3, array, empty, 3); + assertIndexOf(5, array, empty, 5); + assertIndexOf(11, array, empty, 11); + + assertIndexOf(2, array, a, 0); + assertIndexOf(2, array, a, 1); + assertIndexOf(2, array, a, 2); + assertIndexOf(5, array, a, 3); + assertIndexOf(5, array, a, 4); + + assertIndexOf(2, array, ack, 0); + assertIndexOf(2, array, ack, 1); + assertIndexOf(2, array, ack, 2); + assertIndexOf(8, array, ack, 3); + assertIndexOf(8, array, ack, 4); + assertIndexOf(8, array, ack, 8); + } + + /** + * @param expectedIndex + * where the index is expected to 
be + * @param array + * to search through + * @param target + * to search for + * @param fromIndex + * to start search from + */ + private static void assertIndexOf(int expectedIndex, byte[] array, + byte[] target, int fromIndex) { + int index = ByteUtil.indexOf(array, target, fromIndex); + assertEquals(expectedIndex, index); + byte[] sub = java.util.Arrays.copyOfRange(array, index, index + + target.length); + assertEquals(0, Bytes.compareTo(target, sub)); + } + +} diff --git a/hraven-core/src/test/resources/done/something.example.com_1337787092259_job_201205231531_256984_userName1_App1 b/hraven-core/src/test/resources/done/something.example.com_1337787092259_job_201205231531_256984_userName1_App1 new file mode 100644 index 0000000..9dfed86 --- /dev/null +++ b/hraven-core/src/test/resources/done/something.example.com_1337787092259_job_201205231531_256984_userName1_App1 @@ -0,0 +1,4 @@ +Meta VERSION="1" . +Job JOBID="job_201205231531_256984" JOBNAME="pqrs:abc_something:xyz" USER="user1234" SUBMIT_TIME="1338958320124" JOBCONF="hdfs://something\.example\.com/user/user1234/\.staging/job_201205231531_256984/job\.xml" VIEW_JOB="*" MODIFY_JOB="*" JOB_QUEUE="default" . +Job JOBID="job_201205231531_256984" JOB_PRIORITY="NORMAL" . +Job JOBID="job_201205231531_256984" JOB_STATUS="RUNNING" . diff --git a/hraven-core/src/test/resources/log4j.properties b/hraven-core/src/test/resources/log4j.properties new file mode 100644 index 0000000..96df477 --- /dev/null +++ b/hraven-core/src/test/resources/log4j.properties @@ -0,0 +1,20 @@ +log4j.rootCategory=INFO,console + +# +# console +# Add "console" to rootlogger above if you want to use this +# +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d %-5p [%t] %C{2}(%L): %m%n + +# Custom Logging levels + +#log4j.logger.org.apache.hadoop.fs.FSNamesystem=DEBUG + +log4j.logger.org.apache.hadoop=WARN +log4j.logger.org.apache.zookeeper=ERROR +log4j.logger.org.apache.hadoop.hbase=INFO + +log4j.logger.com.twitter.hraven=DEBUG diff --git a/hraven-etl/.settings/org.eclipse.jdt.core.prefs b/hraven-etl/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..14284d7 --- /dev/null +++ b/hraven-etl/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,5 @@ +#Thu May 16 09:59:33 PDT 2013 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.source=1.6 +org.eclipse.jdt.core.compiler.compliance=1.6 diff --git a/hraven-etl/pom.xml b/hraven-etl/pom.xml new file mode 100644 index 0000000..72ea6dc --- /dev/null +++ b/hraven-etl/pom.xml @@ -0,0 +1,410 @@ + + + + + + 4.0.0 + + com.twitter.hraven + hraven + 0.9.0-SNAPSHOT + ../ + + com.twitter.hraven + hraven-etl + 0.9.0-SNAPSHOT + hRaven - etl + jar + ETL map reduce jobs and supporting components for data loading + + + + + + + maven-compiler-plugin + 2.1 + + ${compileSource} + ${compileSource} + true + false + -Xmx1024m + + + + maven-jar-plugin + 2.3 + + + maven-source-plugin + 2.1.1 + + + maven-surefire-plugin + 2.5 + + 3600 + -Xmx512m + true + + + + maven-javadoc-plugin + 2.6.1 + + + maven-clean-plugin + 2.4 + + + + + build + + + + + + maven-dependency-plugin + 2.4 + + + copy-dependencies + package + + copy-dependencies + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.2 + + + + test-jar + + + + + + + + + + src/test/resources/ + + log4j.properties + done/* + + + + + + + 
org.apache.maven.plugins + maven-source-plugin + + + attach-sources + package + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + always + + **/Test*.java + + + **/*$* + + + + + + maven-assembly-plugin + 2.3 + + true + + + + + + + + + com.twitter.hraven + hraven-core + 0.9.0-SNAPSHOT + + + + com.twitter.hraven + hraven-core + 0.9.0-SNAPSHOT + test-jar + test + + + + com.google.guava + guava + ${guava.version} + + + commons-logging + commons-logging + ${commons-logging.version} + + + commons-cli + commons-cli + 1.2 + + + + log4j + log4j + ${log4j.version} + + + javax.jms + jms + + + javax.mail + mail + + + com.sun.jmx + jmxri + + + com.sun.jdmk + jmxtools + + + + + + org.apache.hadoop + hadoop-core + ${hadoop.version} + provided + + + hsqldb + hsqldb + + + net.sf.kosmosfs + kfs + + + org.eclipse.jdt + core + + + net.java.dev.jets3t + jets3t + + + oro + oro + + + + + org.apache.hbase + hbase + ${hbase.version} + provided + + + org.apache.thrift + thrift + + + org.mortbay.jetty + jetty + + + org.mortbay.jetty + jetty-util + + + org.mortbay.jetty + jsp-2.1 + + + org.mortbay.jetty + jsp-api-2.1 + + + org.mortbay.jetty + servlet-api-2.5 + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + + + + + com.sun.jersey + jersey-servlet + ${jersey.version} + + + com.sun.jersey + jersey-json + ${jersey.version} + + + com.sun.jersey + jersey-server + ${jersey.version} + + + com.sun.jersey + jersey-core + ${jersey.version} + + + org.codehaus.jackson + jackson-core-asl + ${jackson.version} + + + org.codehaus.jackson + jackson-mapper-asl + ${jackson.version} + + + org.codehaus.jackson + jackson-jaxrs + ${jackson.version} + + + org.codehaus.jackson + jackson-xc + ${jackson.version} + + + + + org.apache.hadoop + hadoop-test + ${hadoop.version} + test + + + org.apache.hbase + hbase + ${hbase.version} + tests + test + + + org.apache.thrift + thrift + + + + + junit + junit + ${junit.version} + test + + + org.mockito + mockito-all + ${mockito-all.version} + test + + + + + + os.linux + + false + + Linux + + + + ${os.name}-${os.arch}-${sun.arch.data.model} + + + + os.mac + + + Mac + + + + Mac_OS_X-${sun.arch.data.model} + + + + + + + + maven-javadoc-plugin + 2.6.1 + + true + + + + default + + javadoc + + + + + + + diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/FileLister.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/FileLister.java new file mode 100644 index 0000000..a73fd30 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/FileLister.java @@ -0,0 +1,89 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.etl; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +/** + * Utility class that performs operations on hdfs files such as listing files recursively + * Used by JobFilePartitioner and JobFilePreprocessor + * + */ +public class FileLister { + + /** + * Default constructor. + */ + public FileLister() { + } + + /* + * Recursively traverses the dirs to get the list of + * files for a given path filtered as per the input path range filter + * + */ + private static void traverseDirs(List fileStatusesList, FileSystem hdfs, + Path inputPath, JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter) + throws IOException + { + // get all the files and dirs in the current dir + FileStatus allFiles[] = hdfs.listStatus(inputPath); + for (FileStatus aFile: allFiles) { + if (aFile.isDir()) { + //recurse here + traverseDirs(fileStatusesList, hdfs, aFile.getPath(), jobFileModifiedRangePathFilter); + } + else { + // check if the pathFilter is accepted for this file + if (jobFileModifiedRangePathFilter.accept(aFile.getPath())) { + fileStatusesList.add(aFile); + } + } + } + } + + /* + * Gets the list of files for a given path filtered as per the input path range filter + * Can go into directories recursively + * + * @param recurse - whether or not to traverse recursively + * @param hdfs - the file system + * @param inputPath - the path to traverse for getting the list of files + * @param jobFileModifiedRangePathFilter - the filter to include/exclude certain files + * + * @return array of file status. + * @throws IOException + */ + public static FileStatus[] listFiles (boolean recurse, FileSystem hdfs, Path inputPath, + JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter) throws IOException + { + if (recurse) { + List fileStatusesList = new ArrayList(); + traverseDirs(fileStatusesList, hdfs, inputPath, jobFileModifiedRangePathFilter); + FileStatus[] fileStatuses = (FileStatus[]) fileStatusesList.toArray( + new FileStatus[fileStatusesList.size()]); + return fileStatuses; + } + else { + return hdfs.listStatus(inputPath, jobFileModifiedRangePathFilter); + } + } +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/FileStatusModificationComparator.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/FileStatusModificationComparator.java new file mode 100644 index 0000000..736719b --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/FileStatusModificationComparator.java @@ -0,0 +1,55 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import java.util.Comparator; + +import org.apache.hadoop.fs.FileStatus; + +public class FileStatusModificationComparator implements Comparator { + + /** + * Default constructor. 
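+   * The comparator keeps no state; ordering is based solely on
+   * {@link FileStatus#getModificationTime()}, with older files sorting first and
+   * null entries sorting ahead of non-null ones.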
+ */ + public FileStatusModificationComparator() { + } + + /* + * (non-Javadoc) + * + * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) + */ + public int compare(FileStatus fileStatus1, FileStatus fileStatus2) { + + // Do the obligatory null checks. + if ((fileStatus1 == null) && (fileStatus2 == null)) { + return 0; + } + if (fileStatus1 == null) { + return -1; + } + if (fileStatus2 == null) { + return 1; + } + + long modificationTime1 = fileStatus1.getModificationTime(); + long modificationTime2 = fileStatus2.getModificationTime(); + + return (modificationTime1 < modificationTime2 ? -1 + : (modificationTime1 == modificationTime2 ? 0 : 1)); + }; + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/ImportException.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/ImportException.java new file mode 100644 index 0000000..0d22f3b --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/ImportException.java @@ -0,0 +1,31 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +/** + */ +public class ImportException extends RuntimeException { + + private static final long serialVersionUID = 2312684791991178660L; + + public ImportException(String message) { + super(message); + } + + public ImportException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFile.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFile.java new file mode 100644 index 0000000..8497605 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFile.java @@ -0,0 +1,155 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +import com.twitter.hraven.Constants; + +/** + * Class that handles Job files, whether confs files or history files. + * + */ +public class JobFile implements Writable { + + private static final Pattern PATTERN = Pattern + .compile(Constants.JOB_FILENAME_PATTERN_REGEX); + + private String filename; + private String jobTracker = null; + private String jobid = null; + private boolean isJobConfFile = false; + private boolean isJobHistoryFile = false; + + /** + * Default constructor. 
Used by reflection utils in combination with {@link #readFields(DataInput)} + */ + public JobFile() { + + } + + /** + * Constructor + * + * @param name + * of the file (that may or may not be a job file). + */ + public JobFile(String filename) { + if (null == filename) { + this.filename = ""; + } else { + this.filename = filename; + } + parseFilename(); + } + + /** + * Parse the filename and pull the jobtracker and jobid out of it. + */ + private void parseFilename() { + + // Add additional filtering to discard empty files, or files ending in .crc + if ((filename != null) && (filename.length() > 0) + && (!filename.endsWith(".crc"))) { + + Matcher matcher = PATTERN.matcher(filename); + + if (matcher.matches()) { + jobTracker = matcher.group(1); + jobid = matcher.group(2); + String remainder = matcher.group(3); + + if (Constants.JOB_CONF_FILE_END.equals(remainder)) { + isJobConfFile = true; + } else { + isJobHistoryFile = true; + } + } + } + } + + /** + * @return the name of this file. + */ + public String getFilename() { + return filename; + } + + /** + * @return the job ID for this job as parsed through the filename or null if + * this is not a valid job file. + */ + public String getJobid() { + return jobid; + } + + /** + * @return the jobtracker part or null if this is not a valid job file. + */ + public String getJobTracker() { + return jobTracker; + } + + /** + * @return whether this file is a JobConfFile + */ + public boolean isJobConfFile() { + return isJobConfFile; + } + + /** + * @return whether this file is a JobHistoryFile + */ + public boolean isJobHistoryFile() { + return isJobHistoryFile; + } + + /* + * (non-Javadoc) + * + * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) + */ + @Override + public void write(DataOutput out) throws IOException { + Text.writeString(out, filename); + Text.writeString(out, jobTracker); + Text.writeString(out, jobid); + out.writeBoolean(isJobConfFile); + out.writeBoolean(isJobHistoryFile); + } + + /* + * (non-Javadoc) + * + * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) + */ + @Override + public void readFields(DataInput in) throws IOException { + this.filename = Text.readString(in); + this.jobTracker = Text.readString(in); + this.jobid = Text.readString(in); + this.isJobConfFile = in.readBoolean(); + this.isJobHistoryFile = in.readBoolean(); + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileModifiedRangePathFilter.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileModifiedRangePathFilter.java new file mode 100644 index 0000000..7fc39b7 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileModifiedRangePathFilter.java @@ -0,0 +1,138 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.etl; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +/** + * Pathfilter that allows only files that are named correctly and are modified + * within a certain time range. + * + */ +public class JobFileModifiedRangePathFilter extends JobFilePathFilter { + + /** + * The minimum modification time of a file to be accepted in milliseconds + * since January 1, 1970 UTC (excluding). + */ + private final long minModificationTimeMillis; + + /** + * The maximum modification time of a file to be accepted in milliseconds + * since January 1, 1970 UTC (including). + */ + private final long maxModificationTimeMillis; + + /** + * The configuration of this processing job (not the files we are processing). + */ + private final Configuration myConf; + + /** + * Constructs a filter that accepts only JobFiles with lastModification time + * in the specified range. + * + * @param myConf + * used to be able to go from a path to a FileStatus. + * @param minModificationTimeMillis + * The minimum modification time of a file to be accepted in + * milliseconds since January 1, 1970 UTC (excluding). + * @param maxModificationTimeMillis The + * maximum modification time of a file to be accepted in milliseconds + * since January 1, 1970 UTC (including). + */ + public JobFileModifiedRangePathFilter(Configuration myConf, + long minModificationTimeMillis, long maxModificationTimeMillis) { + this.myConf = myConf; + this.minModificationTimeMillis = minModificationTimeMillis; + this.maxModificationTimeMillis = maxModificationTimeMillis; + } + + /** + * Constructs a filter that accepts only JobFiles with lastModification time + * as least the specified minumum. + * + * @param myConf + * used to be able to go from a path to a FileStatus. + * @param minModificationTimeMillis + * The minimum modification time of a file to be accepted in + * milliseconds since January 1, 1970 UTC (excluding). + */ + public JobFileModifiedRangePathFilter(Configuration myConf, + long minModificationTimeMillis) { + this(myConf, minModificationTimeMillis, Long.MAX_VALUE); + } + + /* + * (non-Javadoc) + * + * @see + * com.twitter.hraven.etl.JobFilePathFilter#accept(org.apache + * .hadoop.fs.Path) + */ + @Override + public boolean accept(Path path) { + if (!super.accept(path)) { + return false; + } + + JobFile jobFile = new JobFile(path.getName()); + if (jobFile.isJobConfFile() || jobFile.isJobHistoryFile()) { + try { + FileSystem fs = path.getFileSystem(myConf); + FileStatus fileStatus = fs.getFileStatus(path); + long fileModificationTimeMillis = fileStatus.getModificationTime(); + + return accept(fileModificationTimeMillis); + } catch (IOException e) { + throw new ImportException("Cannot determine file modification time of " + + path.getName(), e); + } + } else { + // Reject anything that does not match a job conf filename. + return false; + } + } + + /** + * @param fileModificationTimeMillis + * in milliseconds since January 1, 1970 UTC + * @return whether a file with such modification time is to be accepted. + */ + public boolean accept(long fileModificationTimeMillis) { + return ((minModificationTimeMillis < fileModificationTimeMillis) && (fileModificationTimeMillis <= maxModificationTimeMillis)); + } + + /** + * @return the minModificationTimeMillis used in for this filter. 
+ */ + public long getMinModificationTimeMillis() { + return minModificationTimeMillis; + } + + /** + * @return the maxModificationTimeMillis used for this filter + */ + public long getMaxModificationTimeMillis() { + return maxModificationTimeMillis; + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePartitioner.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePartitioner.java new file mode 100644 index 0000000..f93d7d8 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePartitioner.java @@ -0,0 +1,541 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Date; +import java.util.TimeZone; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileSystem.Statistics; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import com.twitter.hraven.util.BatchUtil; + +/** + * Command line tool to take a directory and split all the job confs into a + * hierarchical directory structure in HDFS by date (of the file). Can load from + * HDFS history done directory, or from a local filesystem (identified as + * file://) + *
<p>
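+ * A typical invocation (the jar name and output path below are illustrative) is:
+ * <pre>
+ * hadoop jar hraven-etl.jar com.twitter.hraven.etl.JobFilePartitioner \
+ *     -i /hadoop/mapred/history/done -o /hraven/history/partitioned -s
+ * </pre>
+ * <p>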
+ * A limit can be specified how many files should be left in the source + * directory. That is handy for continuously running on the + * /hadoop/mapred/history/done directory to partition all files, but to keep the + * growth of the done directory in check. + */ +public class JobFilePartitioner extends Configured implements Tool { + + final static String NAME = JobFilePartitioner.class.getSimpleName(); + private static Log LOG = LogFactory.getLog(JobFilePartitioner.class); + + // Simple format look like this: yyyy-MM-dd HH:mm + public static final SimpleDateFormat YEAR_FORMAT = new SimpleDateFormat( + "yyyy"); + public static final SimpleDateFormat MONTH_FORMAT = new SimpleDateFormat("MM"); + public static final SimpleDateFormat DAY_FORMAT = new SimpleDateFormat("dd"); + + // Initialize to use UTC + static { + TimeZone utc = TimeZone.getTimeZone("UTC"); + YEAR_FORMAT.setTimeZone(utc); + MONTH_FORMAT.setTimeZone(utc); + DAY_FORMAT.setTimeZone(utc); + } + + /** + * To be used for this job. + */ + Configuration myConf; + + /** + * Input directory (presumed in HDFS, unless prefixed with file:// + */ + String input; + + /** + * Whether files that are already in the target directory should be skipped. + */ + boolean skipExisting = true; + + /** + * Whether files should be moved rather than copied. Can be used with HDFS + * input paths only. + */ + boolean moveFiles = true; + + /** + * The maximum number of files to retain in the input directory after + * processing. + */ + int maXretention = Integer.MAX_VALUE; + + /** + * Used to read files from (if input is HDFS) and to write output to. + */ + FileSystem hdfs; + + /** + * Location in HDFS where to write the output to. Under this directory a + * year/month/day directory structure will be created. + */ + Path outputPath; + + /** + * Parse command-line arguments. + * + * @param args + * command line arguments passed to program. + * @return parsed command line. + * @throws ParseException + */ + private static CommandLine parseArgs(String[] args) throws ParseException { + Options options = new Options(); + + // Input + Option o = new Option("i", "input", true, + "input directory as hdfs path, or local as file://"); + o.setArgName("input-path"); + o.setRequired(true); + options.addOption(o); + + // Input + o = new Option("o", "output", true, "output directory"); + o.setArgName("input-path"); + o.setRequired(true); + options.addOption(o); + + // Whether to skip existing files or not. + o = new Option("s", "skipExisting", false, + "skip existing files. Cannot be used together with m for move."); + o.setRequired(false); + options.addOption(o); + + // Maximum number of files to retain in the specified input directory. + o = new Option( + "x", + "maXretention", + true, + "The maximum number of the most recent files to retain in the input directory after processing." + + " Can be used by HDFS input paths only. Mutually exclusive with s (move)," + + " but can be used in combination with s (skipExisting)"); + o.setRequired(false); + options.addOption(o); + + // Whether files need to be moved + o = new Option("m", "move", false, "move all files rather than copying." + + "Delete source if target already exists." + + " Can be used with HDFS input paths only. 
" + + " Mutually exlusive with s (skipExisting)"); + o.setRequired(false); + options.addOption(o); + + // Debugging + options.addOption("d", "debug", false, "switch on DEBUG log level"); + o.setRequired(false); + options.addOption(o); + + CommandLineParser parser = new PosixParser(); + CommandLine commandLine = null; + try { + commandLine = parser.parse(options, args); + } catch (Exception e) { + System.err.println("ERROR: " + e.getMessage() + "\n"); + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(NAME + " ", options, true); + System.exit(-1); + } + + return commandLine; + } + + /* + * Do the actual work. + * + * @see org.apache.hadoop.util.Tool#run(java.lang.String[]) + */ + @Override + public int run(String[] args) throws Exception { + + myConf = getConf(); + + // Presume this is all HDFS paths, even when access as file:// + hdfs = FileSystem.get(myConf); + + // Grab input args and allow for -Dxyz style arguments + String[] otherArgs = new GenericOptionsParser(myConf, args) + .getRemainingArgs(); + + // Grab the arguments we're looking for. + CommandLine commandLine = parseArgs(otherArgs); + + // Grab the input path argument + input = commandLine.getOptionValue("i"); + LOG.info("input=" + input); + + // Grab the input path argument + String output = commandLine.getOptionValue("o"); + LOG.info("output=" + output); + + skipExisting = commandLine.hasOption("s"); + LOG.info("skipExisting=" + skipExisting); + + moveFiles = commandLine.hasOption("m"); + LOG.info("moveFiles=" + moveFiles); + + if (skipExisting && moveFiles) { + throw new IllegalArgumentException( + "Cannot use both options skipExisting and move simultaneously."); + } + + if (commandLine.hasOption("x")) { + try { + maXretention = Integer.parseInt(commandLine.getOptionValue("x")); + } catch (NumberFormatException nfe) { + throw new IllegalArgumentException( + "maXretention option -x is is not a valid number: " + + commandLine.getOptionValue("x"), nfe); + } + // Additional check + if (maXretention < 0) { + throw new IllegalArgumentException( + "Cannot retain less than 0 files. Specified maXretention option -x is: " + + commandLine.getOptionValue("x")); + } + LOG.info("maXretention=" + maXretention); + if (moveFiles) { + throw new IllegalArgumentException( + "Cannot use both options maXretention and move simultaneously."); + } + } else { + maXretention = Integer.MAX_VALUE; + } + + outputPath = new Path(output); + FileStatus outputFileStatus = hdfs.getFileStatus(outputPath); + + if (!outputFileStatus.isDir()) { + throw new IOException("Output is not a directory" + + outputFileStatus.getPath().getName()); + } + + Path inputPath = new Path(input); + URI inputURI = inputPath.toUri(); + String inputScheme = inputURI.getScheme(); + + LOG.info("input scheme is: " + inputScheme); + + // If input directory is HDFS, then process as such. Assume not scheme is + // HDFS + if ((inputScheme == null) + || (hdfs.getUri().getScheme().equals(inputScheme))) { + processHDFSSources(inputPath); + } else if (inputScheme.equals("file")) { + if (moveFiles) { + throw new IllegalArgumentException( + "Cannot move files that are not already in hdfs. 
Input is not HDFS: " + + input); + } + processPlainFileSources(inputURI); + } else { + throw new IllegalArgumentException( + "Cannot process files from this URI scheme: " + inputScheme); + } + + Statistics statistics = FileSystem.getStatistics(outputPath.toUri() + .getScheme(), hdfs.getClass()); + if (statistics != null) { + LOG.info("HDFS bytes read: " + statistics.getBytesRead()); + LOG.info("HDFS bytes written: " + statistics.getBytesWritten()); + LOG.info("HDFS read ops: " + statistics.getReadOps()); + System.out + .println("HDFS large read ops: " + statistics.getLargeReadOps()); + LOG.info("HDFS write ops: " + statistics.getWriteOps()); + } + + return 0; + } + + /** + * Process HDFS source directory. + * + * @param hdfs + * @param outputPath + * @param inputPath + * @throws IOException + */ + /** + * @param inputPath + * @throws IOException + */ + /** + * @param inputPath + * @throws IOException + */ + private void processHDFSSources(Path inputPath) throws IOException { + // Try to get the fileStatus only if we're reasonably confident that this + // is an HDFS path.s + FileStatus inputFileStatus = hdfs.getFileStatus(inputPath); + + // Check if input is a directory + if (!inputFileStatus.isDir()) { + throw new IOException("Input is not a directory in HDFS: " + input); + } + + // Accept only jobFiles and only those that fall in the desired range of + // modification time. + JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter = new JobFileModifiedRangePathFilter( + myConf, 0L); + + ContentSummary contentSummary = hdfs.getContentSummary(inputPath); + LOG.info("Listing / filtering (" + contentSummary.getFileCount() + + ") files in: " + inputPath); + + // get the files in the done folder, + // need to traverse dirs under done recursively for versions + // that include MAPREDUCE-323: on/after hadoop 0.20.203.0 + // on/after cdh3u5 + FileStatus[] jobFileStatusses = FileLister.listFiles(true, hdfs, inputPath, + jobFileModifiedRangePathFilter); + + LOG.info("Sorting " + jobFileStatusses.length + " job files."); + + Arrays.sort(jobFileStatusses, new FileStatusModificationComparator()); + + int processedCount = 0; + try { + + for (int i = 0; i < jobFileStatusses.length; i++) { + FileStatus jobFileStatus = jobFileStatusses[i]; + + boolean retain = BatchUtil.shouldRetain(i, maXretention, jobFileStatusses.length); + processHDFSSource(hdfs, jobFileStatus, outputPath, myConf, + skipExisting, retain); + processedCount++; + // Print something each 1k files to show progress. + if ((i % 1000) == 0) { + LOG.info("Processed " + i + " files."); + } + + } + + } finally { + LOG.info("Processed " + processedCount + " files."); + } + } + + + + /** + * Input is a regular directory (non-hdfs). Process files accordingly. + * + * @param inputURI + * @throws IOException + */ + private void processPlainFileSources(URI inputURI) throws IOException { + LOG.info("Scheme specific part is: " + inputURI.getSchemeSpecificPart()); + + File inputFile = new File(inputURI.getSchemeSpecificPart()); + + // Check if input is a directory + if (!inputFile.isDirectory()) { + throw new IOException("Input is not a regular directory: " + input); + } + + File[] files = inputFile.listFiles(); + int processedCount = 0; + try { + for (File f : files) { + // Process only files, not (sub)directories. + if (f.isFile()) { + processPlainFile(hdfs, f, outputPath, skipExisting); + processedCount++; + // Print something each 100 files to show progress. 
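+        // The modulus below is 1000, so a progress line is logged once per 1000 files.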
+ if ((processedCount % 1000) == 0) { + LOG.info("Processed " + processedCount + " files."); + } + } + } + } finally { + LOG.info("Processed " + processedCount + " files."); + } + } + + /** + * @param hdfs + * FileSystem handle + * @param outputPath + * base directory where files to be written to + * @param fileModTime + * of the file that needs to be moved/copied to hdfs + * @return the existing path in HDFS to write to the file to. Will be created + * if it does not exist. + * @throws IOException + * if the year/month/day directory with cannot be created in + * outputPath. + */ + private Path getTargetDirectory(FileSystem hdfs, Path outputPath, + long fileModTime) throws IOException { + String year = YEAR_FORMAT.format(new Date(fileModTime)); + String month = MONTH_FORMAT.format(new Date(fileModTime)); + String day = DAY_FORMAT.format(new Date(fileModTime)); + + Path yearDir = new Path(outputPath, year); + Path monthDir = new Path(yearDir, month); + Path dayDir = new Path(monthDir, day); + + // Check if the directory already exists, if not, then insert a record into + // HBase for it. + if (!hdfs.exists(dayDir)) { + if (hdfs.mkdirs(dayDir)) { + LOG.info("Created: " + dayDir.toString()); + } else { + throw new IOException("Unable to create target directory with date: " + + dayDir.getName()); + } + } + return dayDir; + } + + /** + * @param hdfs + * FileSystem handle + * @param f + * file to copy to HDFS + * @param outputPath + * @param skipExisting + * skip if the file already exist in the target. File will be + * overwritten if already there and this argument is false. + * @throws IOException + * if target directory cannot be created or file cannot be copied to + * target directory. + */ + private void processPlainFile(FileSystem hdfs, File f, Path outputPath, + boolean skipExisting) throws IOException { + long fileModTime = f.lastModified(); + Path targetDir = getTargetDirectory(hdfs, outputPath, fileModTime); + + boolean doCopy = true; + Path sourceFile = new Path(f.getPath()); + if (skipExisting) { + Path target = new Path(targetDir, sourceFile.getName()); + if (hdfs.exists(target)) { + doCopy = false; + } + } + if (doCopy) { + hdfs.copyFromLocalFile(sourceFile, targetDir); + } + + } + + /** + * @param hdfs + * FileSystem handle + * @param f + * file to process + * @param outputPath + * @param conf + * configuration to use for copying. + * @param skipExisting + * skip if the file already exist in the target. File will be + * overwritten if already there and this argument is false. + * @retain whether this file should be retained + * + * @throws IOException + */ + private void processHDFSSource(FileSystem hdfs, FileStatus f, + Path outputPath, Configuration conf, boolean skipExisting, boolean retain) + throws IOException { + + long fileModTime = f.getModificationTime(); + Path targetDir = getTargetDirectory(hdfs, outputPath, fileModTime); + + boolean targetExists = false; + Path target = new Path(targetDir, f.getPath().getName()); + targetExists = hdfs.exists(target); + + if (moveFiles || !retain) { + if (targetExists) { + hdfs.delete(f.getPath(), false); + } else { + hdfs.rename(f.getPath(), targetDir); + } + } else { + if (targetExists && skipExisting) { + // Do nothing, target is already there and we're instructed to skip + // existing records. + } else { + copy(hdfs, f, conf, targetDir); + } + } + } + + /** + * @param hdfs FileSystem handle + * @param f to copy + * @param conf configuration to use for copying. + * @param targetDir directory to copy said file to. 
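+   * Copies that take longer than three seconds are logged at WARN level, including
+   * the file name and size in the message.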
+ * @throws IOException + */ + private void copy(FileSystem hdfs, FileStatus f, Configuration conf, + Path targetDir) throws IOException { + long startNanos = System.nanoTime(); + FileUtil.copy(hdfs, f.getPath(), hdfs, targetDir, false, true, conf); + long estimatedTimeNanos = System.nanoTime() - startNanos; + // Nanos are 10^-9, millis 10^-3 + long durationMillis = estimatedTimeNanos / 1000000; + if (durationMillis > 3000) { + String msg = "It took " + durationMillis / 1000 + " seconds to copy " + + f.getPath().getName() + " of " + f.getLen() + " bytes."; + LOG.warn(msg); + } + } + + /** + * DoIt. + * + * @param args + * the arguments to do it with + */ + public static void main(String[] args) { + try { + ToolRunner.run(new JobFilePartitioner(), args); + } catch (Exception e) { + LOG.error("Problem running: " + NAME, e); + } + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePathFilter.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePathFilter.java new file mode 100644 index 0000000..b279c64 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePathFilter.java @@ -0,0 +1,49 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; + +/** + * {@link PathFilter} that accepts only job conf or job history files. + * + */ +public class JobFilePathFilter implements PathFilter { + + /** + * Default constructor. + */ + public JobFilePathFilter() { + } + + /* + * Accept only those paths that are either job confs or job history files. + * + * @see org.apache.hadoop.fs.PathFilter#accept(org.apache.hadoop.fs.Path) + */ + @Override + public boolean accept(Path path) { + // Ideally we want to do this + // JobFile jobFile = new JobFile(path.getName()); + // return (jobFile.isJobConfFile() || jobFile.isJobHistoryFile()); + // Aside from that not being efficient, it also chokes on input directories. + + // therefore, allow anythying but CRC files. The record reader will have to deal with the rest. + return !((path == null) || (path.getName().endsWith(".crc"))); + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePreprocessor.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePreprocessor.java new file mode 100644 index 0000000..b8991eb --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFilePreprocessor.java @@ -0,0 +1,428 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Date; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileSystem.Statistics; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.io.SequenceFile.Writer; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.etl.ProcessRecordService; +import com.twitter.hraven.util.BatchUtil; +import com.twitter.hraven.etl.FileLister; +import com.twitter.hraven.etl.JobFileModifiedRangePathFilter; + +/** + * Command line tool that can be run on a periodic basis (like daily, hourly, or + * every 15 minutes or 1/2 hour). Each run is recorded by inserting a new + * {@link ProcessRecord} in {@link ProcessState#CREATED} state. When the total + * processing completes successfully, then the record state will be updated to + * {@link ProcessState#PREPROCESSED} to indicate that this batch has been + * successfully updated. The run start time will be recorded in as + * {@link ProcessRecord#getMaxModificationTimeMillis()} so it can be used as the + * starting mark for the next run if the previous run is successful. + * + * Given the sloooow copying of 100k little files in Hadoop (pull from HDFS, + * push back in) we need to run this as multiple mappers. - Pull the last + * process date from HBase. - Insert a new record into HBase with the last date + * as the start and the current date as the end. - Create a map-reduce job that + * reads whole files, combine, and set a min to have multiple maps. - Then copy + * files and emit the smallest job_id as a key and a timestamp as a value - Then + * have a combiner that combines keys/values - then pick up the result from the + * smallest number - Then update record in HBase with the processing date to + * mark that processing finished (or not). + * + */ +public class JobFilePreprocessor extends Configured implements Tool { + + public final static String NAME = JobFilePreprocessor.class.getSimpleName(); + private static Log LOG = LogFactory.getLog(JobFilePreprocessor.class); + + /** + * Maximum number of files to process in one batch. + */ + private final static int DEFAULT_BATCH_SIZE = 1000; + + /** + * Name of the job conf property used to pass the output directory to the + * mappers. + */ + public final static String JOB_RECORD_KEY_LABEL = NAME + ".job.record.key"; + + /** + * Default constructor. 
+ */ + public JobFilePreprocessor() { + } + + /** + * Used for injecting confs while unit testing + * + * @param conf + */ + public JobFilePreprocessor(Configuration conf) { + super(conf); + } + + /** + * Parse command-line arguments. + * + * @param args + * command line arguments passed to program. + * @return parsed command line. + * @throws ParseException + */ + private static CommandLine parseArgs(String[] args) throws ParseException { + Options options = new Options(); + + // Cluster + Option o = new Option("c", "cluster", true, + "cluster for which jobs are processed"); + o.setArgName("cluster"); + o.setRequired(true); + options.addOption(o); + + // Input + o = new Option("o", "output", true, + "output directory in hdfs. This is where the index files are written."); + o.setArgName("output-path"); + o.setRequired(true); + options.addOption(o); + + // Input + o = new Option( + "i", + "input", + true, + "input directory in hdfs. Default is mapred.job.tracker.history.completed.location."); + o.setArgName("input-path"); + o.setRequired(false); + options.addOption(o); + + // Batch + o = new Option("b", "batchSize", true, + "The number of files to process in one batch. Default " + + DEFAULT_BATCH_SIZE); + o.setArgName("batch-size"); + o.setRequired(false); + options.addOption(o); + + // Force + o = new Option( + "f", + "forceAllFiles", + false, + "Force all files in a directory to be processed, no matter the previous processingRecord. Default: false. Usefull for batch loads."); + o.setRequired(false); + options.addOption(o); + + // Debugging + options.addOption("d", "debug", false, "switch on DEBUG log level"); + + CommandLineParser parser = new PosixParser(); + CommandLine commandLine = null; + try { + commandLine = parser.parse(options, args); + } catch (Exception e) { + System.err.println("ERROR: " + e.getMessage() + "\n"); + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(NAME + " ", options, true); + System.exit(-1); + } + + // Set debug level right away + if (commandLine.hasOption("d")) { + Logger log = Logger.getLogger(JobFileRawLoader.class); + log.setLevel(Level.DEBUG); + } + + return commandLine; + } + + /* + * Do the actual work. + * + * @see org.apache.hadoop.util.Tool#run(java.lang.String[]) + */ + @Override + public int run(String[] args) throws Exception { + + // When we started processing. This is also the upper limit of files we + // accept, next run will pick up the new incoming files. + long processingStartMillis = System.currentTimeMillis(); + + Configuration hbaseConf = HBaseConfiguration.create(getConf()); + + // Grab input args and allow for -Dxyz style arguments + String[] otherArgs = new GenericOptionsParser(hbaseConf, args) + .getRemainingArgs(); + + // Grab the arguments we're looking for. + CommandLine commandLine = parseArgs(otherArgs); + + // Output should be an hdfs path. 
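+ // The FileSystem handle is obtained from the HBase-augmented configuration.
+ // The -o (output) argument is validated next: the run fails if that path
+ // does not exist or is not a directory.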
+ FileSystem hdfs = FileSystem.get(hbaseConf); + + // Grab the input path argument + String output = commandLine.getOptionValue("o"); + LOG.info("output=" + output); + Path outputPath = new Path(output); + FileStatus outputFileStatus = hdfs.getFileStatus(outputPath); + + if (!outputFileStatus.isDir()) { + throw new IOException("Output is not a directory" + + outputFileStatus.getPath().getName()); + } + + // Grab the input path argument + String input; + if (commandLine.hasOption("i")) { + input = commandLine.getOptionValue("i"); + } else { + input = hbaseConf.get("mapred.job.tracker.history.completed.location"); + } + LOG.info("input=" + input); + + // Grab the batch-size argument + int batchSize; + if (commandLine.hasOption("b")) { + try { + batchSize = Integer.parseInt(commandLine.getOptionValue("b")); + } catch (NumberFormatException nfe) { + throw new IllegalArgumentException( + "batch size option -b is is not a valid number: " + + commandLine.getOptionValue("b"), nfe); + } + // Additional check + if (batchSize < 1) { + throw new IllegalArgumentException( + "Cannot process files in batches smaller than 1. Specified batch size option -b is: " + + commandLine.getOptionValue("b")); + } + } else { + batchSize = DEFAULT_BATCH_SIZE; + } + + boolean forceAllFiles = commandLine.hasOption("f"); + LOG.info("forceAllFiles: " + forceAllFiles); + + Path inputPath = new Path(input); + FileStatus inputFileStatus = hdfs.getFileStatus(inputPath); + + if (!inputFileStatus.isDir()) { + throw new IOException("Input is not a directory" + + inputFileStatus.getPath().getName()); + } + + // Grab the cluster argument + String cluster = commandLine.getOptionValue("c"); + LOG.info("cluster=" + cluster); + + ProcessRecordService processRecordService = new ProcessRecordService( + hbaseConf); + + boolean success = true; + try { + + // Figure out where we last left off (if anywhere at all) + ProcessRecord lastProcessRecord = null; + + if (!forceAllFiles) { + lastProcessRecord = processRecordService + .getLastSuccessfulProcessRecord(cluster); + } + + long minModificationTimeMillis = 0; + if (lastProcessRecord != null) { + // Start of this time period is the end of the last period. + minModificationTimeMillis = lastProcessRecord + .getMaxModificationTimeMillis(); + } + + // Do a sanity check. The end time of the last scan better not be later + // than when we started processing. + if (minModificationTimeMillis > processingStartMillis) { + throw new RuntimeException( + "The last processing record has maxModificationMillis later than now: " + + lastProcessRecord); + } + + // Accept only jobFiles and only those that fall in the desired range of + // modification time. 
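+ // The lower bound of the range is the max modification time recorded by the
+ // last successful run (or 0 when -f forces all files), so each run only
+ // picks up files that arrived since the previous preprocessing pass.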
+ JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter = new JobFileModifiedRangePathFilter( + hbaseConf, minModificationTimeMillis); + + String timestamp = Constants.TIMESTAMP_FORMAT.format(new Date( + minModificationTimeMillis)); + + ContentSummary contentSummary = hdfs.getContentSummary(inputPath); + LOG.info("Listing / filtering (" + + contentSummary.getFileCount() + ") files in: " + inputPath + + " that are modified since " + timestamp); + + // get the files in the done folder, + // need to traverse dirs under done recursively for versions + // that include MAPREDUCE-323: on/after hadoop 0.20.203.0 + // on/after cdh3u5 + FileStatus[] jobFileStatusses = FileLister.listFiles(true, hdfs, inputPath, + jobFileModifiedRangePathFilter); + + LOG.info("Sorting " + jobFileStatusses.length + " job files."); + + Arrays.sort(jobFileStatusses, new FileStatusModificationComparator()); + + // Process these files in batches at a time. + int batchCount = BatchUtil.getBatchCount(jobFileStatusses.length, batchSize); + LOG.info("Batch count: " + batchCount); + for (int b = 0; b < batchCount; b++) { + processBatch(jobFileStatusses, b, batchSize, processRecordService, + cluster, outputPath); + } + + } finally { + processRecordService.close(); + } + + Statistics statistics = FileSystem.getStatistics(inputPath.toUri() + .getScheme(), hdfs.getClass()); + if (statistics != null) { + LOG.info("HDFS bytes read: " + statistics.getBytesRead()); + LOG.info("HDFS bytes written: " + statistics.getBytesWritten()); + LOG.info("HDFS read ops: " + statistics.getReadOps()); + LOG.info("HDFS large read ops: " + statistics.getLargeReadOps()); + LOG.info("HDFS write ops: " + statistics.getWriteOps()); + } + + // Return the status + return success ? 0 : 1; + } + + + + /** + * @param jobFileStatusses + * statusses sorted by modification time. + * @param batch + * which batch needs to be processed (used to calculate offset in + * jobFileStatusses. + * @param batchSize + * process up to length items (or less as to not exceed the length of + * jobFileStatusses + * @param processRecordService + * to be used to access create ProcessRecords. + * @throws IOException + * when the index file cannot be written or moved, or when the HBase + * records cannot be created. + */ + private void processBatch(FileStatus jobFileStatusses[], int batch, + int batchSize, ProcessRecordService processRecordService, String cluster, + Path outputPath) throws IOException { + + int startIndex = batch * batchSize; + + LOG.info("Batch startIndex: " + startIndex + " batchSize: " + + batchSize); + + // Some protection against over and under runs. 
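+ // A batch whose start offset falls outside the sorted file listing is
+ // silently skipped; this also guards against a null listing.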
+ if ((jobFileStatusses == null) || (startIndex < 0) + || (startIndex >= jobFileStatusses.length)) { + return; + } + + MinMaxJobFileTracker minMaxJobFileTracker = new MinMaxJobFileTracker(); + + Path initialProcesFile = processRecordService.getInitialProcessFile( + cluster, batch); + Writer processFileWriter = processRecordService + .createProcessFileWriter(initialProcesFile); + + // Make sure we don't run off the end of the array + int endIndexExclusive = Math.min((startIndex + batchSize), + jobFileStatusses.length); + try { + for (int i = startIndex; i < endIndexExclusive; i++) { + FileStatus fileStatus = jobFileStatusses[i]; + JobFile jobFile = minMaxJobFileTracker.track(fileStatus); + + // String jobfileName = fileStatus.getPath().getName(); + // LOG.info(jobfileName + " modified: " + // + fileStatus.getModificationTime()); + + processFileWriter.append(jobFile, fileStatus); + } + + } finally { + processFileWriter.close(); + } + + Path processFile = processRecordService.moveProcessFile(initialProcesFile, + outputPath); + + int processedJobFiles = endIndexExclusive - startIndex; + + ProcessRecord processRecord = new ProcessRecord(cluster, + ProcessState.PREPROCESSED, + minMaxJobFileTracker.getMinModificationTimeMillis(), + minMaxJobFileTracker.getMaxModificationTimeMillis(), processedJobFiles, + processFile.toString(), minMaxJobFileTracker.getMinJobId(), + minMaxJobFileTracker.getMaxJobId()); + + LOG.info("Creating processRecord: " + processRecord); + + processRecordService.writeJobRecord(processRecord); + + } + + /** + * DoIt. + * + * @param args + * the arguments to do it with + */ + public static void main(String[] args) { + try { + ToolRunner.run(new JobFilePreprocessor(), args); + } catch (Exception e) { + e.printStackTrace(); + LOG.error("Error running job.", e); + } + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileProcessor.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileProcessor.java new file mode 100644 index 0000000..3838d89 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileProcessor.java @@ -0,0 +1,612 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.etl; + +import static com.twitter.hraven.etl.ProcessState.LOADED; +import static com.twitter.hraven.etl.ProcessState.PROCESSED; + +import java.io.IOException; +import java.util.Date; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat; +import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.datasource.JobHistoryRawService; +import com.twitter.hraven.etl.ProcessRecordService; +import com.twitter.hraven.datasource.RowKeyParseException; +import com.twitter.hraven.mapreduce.JobFileTableMapper; + +/** + * Used to process one ProcessingRecord at at time. For each record an HBase job + * is created to scan the corresponding rows in the raw + * + */ +public class JobFileProcessor extends Configured implements Tool { + + final static String NAME = JobFileProcessor.class.getSimpleName(); + private static Log LOG = LogFactory.getLog(JobFileProcessor.class); + + private final String startTimestamp = Constants.TIMESTAMP_FORMAT + .format(new Date(System.currentTimeMillis())); + + private final AtomicInteger jobCounter = new AtomicInteger(0); + + /** + * Maximum number of files to process in one batch. + */ + private final static int DEFAULT_BATCH_SIZE = 100; + + /** + * Default constructor. + */ + public JobFileProcessor() { + + } + + /** + * Used for injecting confs while unit testing + * + * @param conf + */ + public JobFileProcessor(Configuration conf) { + super(conf); + + } + + /** + * Parse command-line arguments. + * + * @param args + * command line arguments passed to program. + * @return parsed command line. + * @throws ParseException + */ + private static CommandLine parseArgs(String[] args) throws ParseException { + Options options = new Options(); + + // Input + Option o = new Option("c", "cluster", true, + "cluster for which jobs are processed"); + o.setArgName("cluster"); + o.setRequired(true); + options.addOption(o); + + // Whether to skip existing files or not. + o = new Option( + "r", + "reprocess", + false, + "Reprocess only those records that have been marked to be reprocessed. Otherwise process all rows indicated in the processing records, but successfully processed job files are skipped."); + o.setRequired(false); + options.addOption(o); + + // Batch + o = new Option("b", "batchSize", true, + "The number of files to process in one batch. 
Default " + + DEFAULT_BATCH_SIZE); + o.setArgName("batch-size"); + o.setRequired(false); + options.addOption(o); + + o = new Option( + "t", + "threads", + true, + "Number of parallel threads to use to run Hadoop jobs simultaniously. Default = 1"); + o.setArgName("thread-count"); + o.setRequired(false); + options.addOption(o); + + o = new Option( + "p", + "processFileSubstring", + true, + "use only those process records where the process file path contains the provided string. Useful when processing production jobs in parallel to historic loads."); + o.setArgName("processFileSubstring"); + o.setRequired(false); + options.addOption(o); + + // Debugging + options.addOption("d", "debug", false, "switch on DEBUG log level"); + + CommandLineParser parser = new PosixParser(); + CommandLine commandLine = null; + try { + commandLine = parser.parse(options, args); + } catch (Exception e) { + System.err.println("ERROR: " + e.getMessage() + "\n"); + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(NAME + " ", options, true); + System.exit(-1); + } + + // Set debug level right away + if (commandLine.hasOption("d")) { + Logger log = Logger.getLogger(JobFileProcessor.class); + log.setLevel(Level.DEBUG); + } + + return commandLine; + } + + /* + * (non-Javadoc) + * + * @see org.apache.hadoop.util.Tool#run(java.lang.String[]) + */ + public int run(String[] args) throws Exception { + + Configuration hbaseConf = HBaseConfiguration.create(getConf()); + + // Grab input args and allow for -Dxyz style arguments + String[] otherArgs = new GenericOptionsParser(hbaseConf, args) + .getRemainingArgs(); + + // Grab the arguments we're looking for. + CommandLine commandLine = parseArgs(otherArgs); + + // Grab the cluster argument + String cluster = commandLine.getOptionValue("c"); + LOG.info("cluster=" + cluster); + + // Number of parallel threads to use + int threadCount = 1; + if (commandLine.hasOption("t")) { + try { + threadCount = Integer.parseInt(commandLine.getOptionValue("t")); + } catch (NumberFormatException nfe) { + throw new IllegalArgumentException( + "Provided thread-count argument (-t) is not a number: " + + commandLine.getOptionValue("t"), nfe); + } + if (threadCount < 1) { + throw new IllegalArgumentException( + "Cannot run fewer than 1 thread. Provided thread-count argument (-t): " + + threadCount); + } + } + LOG.info("threadCount=" + threadCount); + + boolean reprocess = commandLine.hasOption("r"); + LOG.info("reprocess=" + reprocess); + + // Grab the batch-size argument + int batchSize; + if (commandLine.hasOption("b")) { + try { + batchSize = Integer.parseInt(commandLine.getOptionValue("b")); + } catch (NumberFormatException nfe) { + throw new IllegalArgumentException( + "batch size option -b is is not a valid number: " + + commandLine.getOptionValue("b"), nfe); + } + // Additional check + if (batchSize < 1) { + throw new IllegalArgumentException( + "Cannot process files in batches smaller than 1. Specified batch size option -b is: " + + commandLine.getOptionValue("b")); + } + } else { + batchSize = DEFAULT_BATCH_SIZE; + } + + String processFileSubstring = null; + if (commandLine.hasOption("p")) { + processFileSubstring = commandLine.getOptionValue("p"); + } + LOG.info("processFileSubstring=" + processFileSubstring); + + // hbase.client.keyvalue.maxsize somehow defaults to 10 MB and we have + // history files exceeding that. Disable limit. + hbaseConf.setInt("hbase.client.keyvalue.maxsize", 0); + + // Shove this into the jobConf so that we can get it out on the task side. 
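+ // What follows chooses between the two modes: with -r only raw rows marked
+ // for reprocessing are rerun; otherwise every LOADED process record for the
+ // cluster is picked up and its raw rows are scanned in batches of -b.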
+ hbaseConf.setStrings(Constants.CLUSTER_JOB_CONF_KEY, cluster); + + boolean success = false; + if (reprocess) { + success = reProcessRecords(hbaseConf, cluster, batchSize, threadCount); + } else { + success = processRecords(hbaseConf, cluster, batchSize, threadCount, + processFileSubstring); + } + + // Return the status + return success ? 0 : 1; + } + + /** + * Pick up the ranges of jobs to process from ProcessRecords. Skip raw rows + * that have already been processed. + * + * @param conf + * used to contact HBase and to run jobs against + * @param cluster + * for which to process records. + * @param batchSize + * the total number of jobs to process in a batch (a MR job scanning + * these many records in the raw table). + * @param threadCount + * how many parallel threads should be used to run Hadoop jobs in + * parallel. + * @param processFileSubstring + * Use only process records where the process file path contains this + * string. If null or empty string, then no filtering is + * applied. + * @return whether all job files for all processRecords were properly + * processed. + * @throws IOException + * @throws ClassNotFoundException + * when problems occur setting up the job. + * @throws InterruptedException + * @throws ExecutionException + * when at least one of the jobs could not be scheduled. + * @throws RowKeyParseException + */ + boolean processRecords(Configuration conf, String cluster, int batchSize, + int threadCount, String processFileSubstring) throws IOException, + InterruptedException, ClassNotFoundException, ExecutionException, + RowKeyParseException { + + List processRecords = getProcessRecords(conf, cluster, + processFileSubstring); + + // Bail out early if needed + if ((processRecords == null) || (processRecords.size() == 0)) { + return true; + } + + // Grab the min and the max jobId from all processing records. + MinMaxJobFileTracker minMaxJobFileTracker = new MinMaxJobFileTracker(); + + for (ProcessRecord processRecord : processRecords) { + minMaxJobFileTracker.track(processRecord.getMinJobId()); + minMaxJobFileTracker.track(processRecord.getMaxJobId()); + } + + List jobRunners = getJobRunners(conf, cluster, false, batchSize, + minMaxJobFileTracker.getMinJobId(), minMaxJobFileTracker.getMaxJobId()); + + boolean success = runJobs(threadCount, jobRunners); + if (success) { + updateProcessRecords(conf, processRecords); + } + + return success; + } + + /** + * @param conf + * used to contact HBase and to run jobs against + * @param cluster + * for which to process records. + * @param batchSize + * the total number of jobs to process in a batch (a MR job scanning + * these many records in the raw table). + * @param threadCount + * how many parallel threads should be used to run Hadoop jobs in + * parallel. + * @return whether all job files for all processRecords were properly + * processed. + * @throws IOException + * @throws ClassNotFoundException + * when problems occur setting up the job. + * @throws InterruptedException + * @throws ExecutionException + * when at least one of the jobs could not be scheduled. + * @throws RowKeyParseException + */ + boolean reProcessRecords(Configuration conf, String cluster, int batchSize, + int threadCount) throws IOException, InterruptedException, + ClassNotFoundException, ExecutionException, RowKeyParseException { + + List jobRunners = getJobRunners(conf, cluster, true, batchSize, + null, null); + + boolean success = runJobs(threadCount, jobRunners); + return success; + } + + /** + * Run the jobs and wait for all of them to complete. 
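+ * Jobs are submitted to a fixed-size thread pool; the first job that
+ * returns false short-circuits the wait, and the pool is shut down in a
+ * finally block so the JVM can exit.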
+ * + * @param threadCount + * up to how many jobs to run in parallel + * @param jobRunners + * the list of jobs to run. + * @return whether all jobs completed successfully or not. + * @throws InterruptedException + * when interrupted while running jobs. + * @throws ExecutionException + * when at least one of the jobs could not be scheduled. + */ + private boolean runJobs(int threadCount, List jobRunners) + throws InterruptedException, ExecutionException { + ExecutorService execSvc = Executors.newFixedThreadPool(threadCount); + + if ((jobRunners == null) || (jobRunners.size() == 0)) { + return true; + } + + boolean success = true; + try { + List> jobFutures = new LinkedList>(); + for (JobRunner jobRunner : jobRunners) { + Future jobFuture = execSvc.submit(jobRunner); + jobFutures.add(jobFuture); + } + + // Wait for all jobs to complete. + for (Future jobFuture : jobFutures) { + success = jobFuture.get(); + if (!success) { + // Stop the presses as soon as we see an error. Note that several + // other jobs may have already been scheduled. Others will never be + // scheduled. + break; + } + } + } finally { + // Shut down the executor so that the JVM can exit. + List neverRan = execSvc.shutdownNow(); + if (neverRan != null && neverRan.size() > 0) { + System.err + .println("Interrupted run. Currently running Hadoop jobs will continue unless cancelled. " + + neverRan + " jobs never scheduled."); + } + } + return success; + } + + /** + * @param conf + * to be used to connect to HBase + * @param cluster + * for which we're finding processRecords. + * @param processFileSubstring + * if specified, this string must be part of the processFile path to + * limit which records we want to process. + * @return a list of processRecords in {@link ProcessState#LOADED} stqte that + * still need to be processed. + * @throws IOException + */ + private List getProcessRecords(Configuration conf, + String cluster, String processFileSubstring) throws IOException { + ProcessRecordService processRecordService = new ProcessRecordService(conf); + IOException caught = null; + List processRecords = null; + try { + // Grab all records. + processRecords = processRecordService.getProcessRecords(cluster, LOADED, + Integer.MAX_VALUE, processFileSubstring); + + LOG.info("Processing " + processRecords.size() + " for: " + cluster); + } catch (IOException ioe) { + caught = ioe; + } finally { + try { + processRecordService.close(); + } catch (IOException ioe) { + if (caught == null) { + caught = ioe; + } + } + if (caught != null) { + throw caught; + } + } + return processRecords; + } + + /** + * @param conf + * to be used to connect to HBase + * @param cluster + * for which we're finding processRecords. + * @param processFileSubstring + * if specified, this string must be part of the processFile path to + * limit which records we want to process. + * @return a list of processRecords in {@link ProcessState#LOADED} stqte that + * still need to be processed. 
+ * @throws IOException + */ + private void updateProcessRecords(Configuration conf, + List processRecords) throws IOException { + ProcessRecordService processRecordService = new ProcessRecordService(conf); + IOException caught = null; + try { + + for (ProcessRecord processRecord : processRecords) { + // Even if we get an exception, still try to set the other records + try { + processRecordService.setProcessState(processRecord, PROCESSED); + } catch (IOException ioe) { + caught = ioe; + } + } + + } finally { + try { + processRecordService.close(); + } catch (IOException ioe) { + if (caught == null) { + caught = ioe; + } + } + if (caught != null) { + throw caught; + } + } + } + + /** + * @param conf + * used to connect to HBAse + * @param cluster + * for which we are processing + * @param reprocess + * Reprocess those records that may have been processed already. + * Otherwise successfully processed job files are skipped. + * @param reprocessOnly + * process only those raw records that were marked to be reprocessed. + * When true then reprocess argument is ignored and is assumed to be + * true. + * @param batchSize + * the total number of jobs to process in a batch (a MR job scanning + * these many records in the raw table). + * @param minJobId + * used to start the scan. If null then there is no min limit on + * JobId. + * @param maxJobId + * used to end the scan (inclusive). If null then there is no max + * limit on jobId. + * @throws IOException + * @throws InterruptedException + * @throws ClassNotFoundException + * @throws ExecutionException + * @throws RowKeyParseException + */ + private List getJobRunners(Configuration conf, String cluster, + boolean reprocess, int batchSize, String minJobId, String maxJobId) + throws IOException, InterruptedException, ClassNotFoundException, + RowKeyParseException { + List jobRunners = new LinkedList(); + + JobHistoryRawService jobHistoryRawService = new JobHistoryRawService(conf); + try { + + // Bind all MR jobs together with one runID. + long now = System.currentTimeMillis(); + conf.setLong(Constants.MR_RUN_CONF_KEY, now); + + List scanList = jobHistoryRawService.getHistoryRawTableScans( + cluster, minJobId, maxJobId, reprocess, batchSize); + + for (Scan scan : scanList) { + Job job = getProcessingJob(conf, scan, scanList.size()); + + JobRunner jobRunner = new JobRunner(job, null); + jobRunners.add(jobRunner); + } + + } finally { + IOException caught = null; + try { + jobHistoryRawService.close(); + } catch (IOException ioe) { + caught = ioe; + } + + if (caught != null) { + throw caught; + } + } + return jobRunners; + + } + + /** + * @param conf + * to use to create and run the job + * @param scan + * to be used to scan the raw table. + * @param totalJobCount + * the total number of jobs that need to be run in this batch. Used + * in job name. + * @return The job to be submitted to the cluster. + * @throws IOException + * @throws InterruptedException + * @throws ClassNotFoundException + */ + private Job getProcessingJob(Configuration conf, Scan scan, int totalJobCount) + throws IOException { + + Configuration confClone = new Configuration(conf); + + // Turn off speculative execution. + // Note: must be BEFORE the job construction with the new mapreduce API. 
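+ // The flag is set on a clone so the caller's configuration is left untouched.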
+ confClone.setBoolean("mapred.map.tasks.speculative.execution", false); + + // Set up job + Job job = new Job(confClone, getJobName(totalJobCount)); + + // This is a map-only class, skip reduce step + job.setNumReduceTasks(0); + job.setJarByClass(JobFileProcessor.class); + job.setOutputFormatClass(MultiTableOutputFormat.class); + + TableMapReduceUtil.initTableMapperJob(Constants.HISTORY_RAW_TABLE, scan, + JobFileTableMapper.class, JobFileTableMapper.getOutputKeyClass(), + JobFileTableMapper.getOutputValueClass(), job); + + return job; + } + + /** + * @param totalJobCount + * how many jobs there will be in total. Used as indicator in the + * name how far along this job is. + * @return the name to use for each consecutive Hadoop job to launch. + */ + private synchronized String getJobName(int totalJobCount) { + String jobName = NAME + " [" + startTimestamp + " " + + jobCounter.incrementAndGet() + "/" + totalJobCount + "]"; + return jobName; + } + + /** + * DoIt. + * + * @param args + * the arguments to do it with + */ + public static void main(String[] args) { + try { + ToolRunner.run(new JobFileProcessor(), args); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileRawLoader.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileRawLoader.java new file mode 100644 index 0000000..19b4c87 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobFileRawLoader.java @@ -0,0 +1,352 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.etl; + +import static com.twitter.hraven.etl.ProcessState.PREPROCESSED; + +import java.io.IOException; +import java.util.Date; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil; +import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import com.twitter.hraven.Constants; +import com.twitter.hraven.mapreduce.JobFileRawLoaderMapper; + +/** + * Used to load the job files from an HDFS directory to an HBase table. This is + * just the raw loading part of the process. A process table is used to record + * the lowest job_id encountered during this load. + * + */ +public class JobFileRawLoader extends Configured implements Tool { + + final static String NAME = JobFileRawLoader.class.getSimpleName(); + private static Log LOG = LogFactory.getLog(JobFileRawLoader.class); + + private final String startTimestamp = Constants.TIMESTAMP_FORMAT + .format(new Date(System.currentTimeMillis())); + + private final AtomicInteger jobCounter = new AtomicInteger(0); + + /** + * Used to read files from (if input is HDFS) and to write output to. + */ + FileSystem hdfs; + + /** + * Default constructor + */ + public JobFileRawLoader() { + } + + /** + * Used for injecting confs while unit testing + * + * @param conf + */ + public JobFileRawLoader(Configuration conf) { + super(conf); + + } + + /** + * Parse command-line arguments. + * + * @param args + * command line arguments passed to program. + * @return parsed command line. + * @throws ParseException + */ + private static CommandLine parseArgs(String[] args) throws ParseException { + Options options = new Options(); + + // Cluster + Option o = new Option("c", "cluster", true, + "cluster for which jobs are processed"); + o.setArgName("cluster"); + o.setRequired(true); + options.addOption(o); + + o = new Option( + "p", + "processFileSubstring", + true, + "use only those process records where the process file path contains the provided string. Useful when processing production jobs in parallel to historic loads."); + o.setArgName("processFileSubstring"); + o.setRequired(false); + options.addOption(o); + + // Force + o = new Option("f", "forceReprocess", false, + "Force all jobs for which a jobFile is loaded to be reprocessed. Optional. 
Default is false."); + o.setRequired(false); + options.addOption(o); + + // Debugging + options.addOption("d", "debug", false, "switch on DEBUG log level"); + + CommandLineParser parser = new PosixParser(); + CommandLine commandLine = null; + try { + commandLine = parser.parse(options, args); + } catch (Exception e) { + System.err.println("ERROR: " + e.getMessage() + "\n"); + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(NAME + " ", options, true); + System.exit(-1); + } + + // Set debug level right away + if (commandLine.hasOption("d")) { + Logger log = Logger.getLogger(JobFileRawLoader.class); + log.setLevel(Level.DEBUG); + } + + return commandLine; + } + + /* + * (non-Javadoc) + * + * @see org.apache.hadoop.util.Tool#run(java.lang.String[]) + */ + public int run(String[] args) throws ParseException, IOException, + ClassNotFoundException, InterruptedException { + + Configuration myHBaseConf = HBaseConfiguration.create(getConf()); + hdfs = FileSystem.get(myHBaseConf); + + // Grab input args and allow for -Dxyz style arguments + String[] otherArgs = new GenericOptionsParser(myHBaseConf, args) + .getRemainingArgs(); + + // Grab the arguments we're looking for. + CommandLine commandLine = parseArgs(otherArgs); + + String input = null; + boolean inputSpecified = commandLine.hasOption("i"); + if (inputSpecified) { + // Grab the input path argument + input = commandLine.getOptionValue("i"); + LOG.info("input=" + input); + } else { + LOG.info("Processing input from HBase ProcessRecords"); + } + + // Grab the cluster argument + String cluster = commandLine.getOptionValue("c"); + LOG.info("cluster=" + cluster); + + String processFileSubstring = null; + if (commandLine.hasOption("p")) { + processFileSubstring = commandLine.getOptionValue("p"); + } + LOG.info("processFileSubstring=" + processFileSubstring); + + boolean forceReprocess = commandLine.hasOption("f"); + LOG.info("forceReprocess: " + forceReprocess); + + // hbase.client.keyvalue.maxsize somehow defaults to 10 MB and we have + // history files exceeding that. Disable limit. + myHBaseConf.setInt("hbase.client.keyvalue.maxsize", 0); + + // Shove this into the jobConf so that we can get it out on the task side. + myHBaseConf.setStrings(Constants.CLUSTER_JOB_CONF_KEY, cluster); + + boolean success = processRecordsFromHBase(myHBaseConf, cluster, + processFileSubstring, forceReprocess); + + // Return the status + return success ? 0 : 1; + } + + /** + * @param myHBaseConf + * used to contact HBase and to run jobs against. Should be an HBase + * configuration. + * @param cluster + * for which to process records. + * @param processFileSubstring + * return rows where the process file path contains this string. If + * null or empty string, then no filtering is applied. + * @param forceReprocess + * whether all jobs for which a file is loaded needs to be + * reprocessed. + * @return whether all job files for all processRecords were properly + * processed. + * @throws IOException + * @throws ClassNotFoundException + * when problems occur setting up the job. + * @throws InterruptedException + */ + private boolean processRecordsFromHBase(Configuration myHBaseConf, + String cluster, String processFileSubstring, boolean forceReprocess) + throws IOException, InterruptedException, ClassNotFoundException { + + int failures = 0; + + ProcessRecordService processRecordService = new ProcessRecordService( + myHBaseConf); + // Grab all records. 
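+ // Only records in PREPROCESSED state are picked up here; each one is
+ // flipped to LOADED further down once its raw-loader job succeeds.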
+ List processRecords = processRecordService + .getProcessRecords(cluster, PREPROCESSED, Integer.MAX_VALUE, + processFileSubstring); + try { + + LOG.info("ProcessRecords for " + cluster + ": " + processRecords.size()); + + // Bind all MR jobs together with one runID. + long now = System.currentTimeMillis(); + myHBaseConf.setLong(Constants.MR_RUN_CONF_KEY, now); + + myHBaseConf.setBoolean(Constants.FORCE_REPROCESS_CONF_KEY, forceReprocess); + + // Iterate over 0 based list in reverse order + for (int j = processRecords.size() - 1; j >= 0; j--) { + ProcessRecord processRecord = processRecords.get(j); + + LOG.info("Processing " + processRecord); + + boolean success = runRawLoaderJob(myHBaseConf, + processRecord.getProcessFile(), processRecords.size()); + // Bail out on first failure. + if (success) { + processRecordService.setProcessState(processRecord, + ProcessState.LOADED); + } else { + failures++; + } + + } + } finally { + processRecordService.close(); + } + + return (failures == 0); + } + + /** + * @param conf + * to use to create and run the job. Should be an HBase + * configuration. + * @param input + * path to the processFile * @param totalJobCount the total number of + * jobs that need to be run in this batch. Used in job name. + * @return whether all job confs were loaded properly. + * @throws IOException + * @throws InterruptedException + * @throws ClassNotFoundException + */ + private boolean runRawLoaderJob(Configuration myHBaseConf, String input, + int totalJobCount) throws IOException, InterruptedException, + ClassNotFoundException { + boolean success; + + // Turn off speculative execution. + // Note: must be BEFORE the job construction with the new mapreduce API. + myHBaseConf.setBoolean("mapred.map.tasks.speculative.execution", false); + + // Set up job + Job job = new Job(myHBaseConf, getJobName(totalJobCount)); + job.setJarByClass(JobFileRawLoader.class); + + Path inputPath = new Path(input); + + if (hdfs.exists(inputPath)) { + + // Set input + job.setInputFormatClass(SequenceFileInputFormat.class); + SequenceFileInputFormat.setInputPaths(job, inputPath); + + job.setMapperClass(JobFileRawLoaderMapper.class); + + // Set the output format to push data into HBase. + job.setOutputFormatClass(TableOutputFormat.class); + TableMapReduceUtil.initTableReducerJob(Constants.HISTORY_RAW_TABLE, null, + job); + + job.setOutputKeyClass(JobFileRawLoaderMapper.getOutputKeyClass()); + job.setOutputValueClass(JobFileRawLoaderMapper.getOutputValueClass()); + + // This is a map-only class, skip reduce step + job.setNumReduceTasks(0); + + // Run the job + success = job.waitForCompletion(true); + + if (success) { + success = hdfs.delete(inputPath, false); + } + + } else { + System.err.println("Unable to find processFile: " + inputPath); + success = false; + } + return success; + } + + /** + * @param totalJobCount + * how many jobs there will be in total. Used as indicator in the + * name how far along this job is. + * @return the name to use for each consecutive Hadoop job to launch. + */ + private synchronized String getJobName(int totalJobCount) { + String jobName = NAME + " [" + startTimestamp + " " + + jobCounter.incrementAndGet() + "/" + totalJobCount + "]"; + return jobName; + } + + /** + * DoIt. 
+ * + * @param args + * the arguments to do it with + */ + public static void main(String[] args) { + try { + ToolRunner.run(new JobFileRawLoader(), args); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + LOG.error("Error running job.", e); + } + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/JobRunner.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobRunner.java new file mode 100644 index 0000000..a8efc61 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/JobRunner.java @@ -0,0 +1,92 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import java.util.concurrent.Callable; + +import org.apache.hadoop.mapreduce.Job; + +/** + * Can be used to run a single Hadoop job. The {@link #call()} method will block + * until the job is complete and will return a non-null return value indicating + * the success of the Hadoop job. + */ +public class JobRunner implements Callable { + + private volatile boolean isCalled = false; + private final Job job; + + /** + * Post processing step that gets called upon successful completion of the + * Hadoop job. + */ + private final Callable postProcessor; + + /** + * Constructor + * + * @param job + * to job to run in the call method. + * @param postProcessor + * Post processing step that gets called upon successful completion + * of the Hadoop job. Can be null, in which case it will be skipped. + * Final results will be the return value of this final processing + * step. + */ + public JobRunner(Job job, Callable postProcessor) { + this.job = job; + this.postProcessor = postProcessor; + } + + /* + * (non-Javadoc) + * + * @see java.util.concurrent.Callable#call() + */ + @Override + public Boolean call() throws Exception { + + // Guard to make sure we get called only once. + if (isCalled) { + return false; + } else { + isCalled = true; + } + + if (job == null) { + return false; + } + + boolean success = false; + // Schedule the job on the JobTracker and wait for it to complete. + try { + success = job.waitForCompletion(true); + } catch (InterruptedException interuptus) { + // We're told to stop, so honor that. + // And restore interupt status. + Thread.currentThread().interrupt(); + // Indicate that we should NOT run the postProcessor. + success = false; + } + + if (success && (postProcessor != null)) { + success = postProcessor.call(); + } + + return success; + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/MinMaxJobFileTracker.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/MinMaxJobFileTracker.java new file mode 100644 index 0000000..a649066 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/MinMaxJobFileTracker.java @@ -0,0 +1,138 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import org.apache.hadoop.fs.FileStatus; + +import com.twitter.hraven.JobId; + +/** + * Used to track the min and max JobId as well as file modification time of + * files encountered + */ +public class MinMaxJobFileTracker { + + /** + * Keep track of the minimum job ID that we have seen to later update the + * processing record. + */ + private JobId minJobId = null; + + /** + * Keep track of the maximum job ID that we have seen to later update the + * processing record. + */ + private JobId maxJobId = null; + + /** + * The minimum modification time of a file in milliseconds since January 1, + * 1970 UTC tracked so far. + */ + private long minModificationTimeMillis; + + /** + * The maximum modification time of a file in milliseconds since January 1, + * 1970 UTC tracked so far. + */ + private long maxModificationTimeMillis; + + /** + * Default constructor. + */ + public MinMaxJobFileTracker() { + // Initialize to guarantee that whatever value we see is the new min. + minModificationTimeMillis = Long.MAX_VALUE; + // Initialize to guarantee that whatever value we see is the new max. + maxModificationTimeMillis = 0L; + } + + /** + * Converts a jobFileStatus to a JobFile and tracks the min and max + * modification times and JobIds. + * + * @param jobFileStatus + * of a jobfile, must be a proper JobFile. Cannot be null. + * @return a JobFile for the given jobFileStatus. + */ + public JobFile track(FileStatus jobFileStatus) { + + String jobfileName = jobFileStatus.getPath().getName(); + JobFile jobFile = new JobFile(jobfileName); + + // Extra check, caller should already have taken care of this. + if (jobFile.isJobConfFile() || jobFile.isJobHistoryFile()) { + track(jobFile.getJobid()); + + long modificationTimeMillis = jobFileStatus.getModificationTime(); + if (modificationTimeMillis < minModificationTimeMillis) { + minModificationTimeMillis = modificationTimeMillis; + } + if (modificationTimeMillis > maxModificationTimeMillis) { + maxModificationTimeMillis = modificationTimeMillis; + } + } + return jobFile; + } + + /** + * @param jobIdString + * to be tracked. + */ + public void track(String jobIdString) { + JobId jobId = new JobId(jobIdString); + + // If the previous minimum is not set, or is larger than the new value, + // the new id is the min + if (minJobId == null || (minJobId.compareTo(jobId) > 0)) { + minJobId = jobId; + } + // Ditto for the max + if (maxJobId == null || (maxJobId.compareTo(jobId) < 0)) { + maxJobId = jobId; + } + } + + /** + * @return The minimum job ID that we have processed so far. + */ + public String getMinJobId() { + return minJobId.getJobIdString(); + } + + /** + * @return The maximum job ID that we have processed so far. + */ + public String getMaxJobId() { + return maxJobId.getJobIdString(); + } + + /** + * @return The minimum modification time of a file in milliseconds since + * January 1, 1970 UTC tracked so far. + */ + public long getMinModificationTimeMillis() { + return minModificationTimeMillis; + } + + /** + * @return The maximum modification time of a file in milliseconds since + * January 1, 1970 UTC tracked so far. 
+ */ + public long getMaxModificationTimeMillis() { + return maxModificationTimeMillis; + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecord.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecord.java new file mode 100644 index 0000000..3738828 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecord.java @@ -0,0 +1,263 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import java.util.Date; + +import com.twitter.hraven.Constants; + +/** + * Used to keep track of a JobFile processing run. + */ +public class ProcessRecord { + + /** + * Used to store this record in HBase. + */ + private final ProcessRecordKey key; + + /** + * The cluster on which the jobs ran that we are recording job history for. + */ + private final String cluster; + + /** + * Keeps track of the state of the processing of a bunch of job conf and job + * history files. + */ + private final ProcessState processState; + + /** + * The minimum modification time of a file in milliseconds since January 1, + * 1970 UTC (excluding) encountered in this batch of files. + */ + private final long minModificationTimeMillis; + + /** + * The maximum modification time of a file in milliseconds since January 1, + * 1970 UTC (including) encountered in this batch of files. + */ + private final long maxModificationTimeMillis; + + /** + * How many Job files have been processed successfully. For each job there + * would typically be 2 files: a conf and a history file. + */ + private final int processedJobFiles; + + /** + * The file in hdfs where the job file names are stored for this process run. + */ + private final String processFile; + + /** + * The minimum job ID in this batch of jobs. Used to efficiently scan the RAW + * table. Will not be available until {@link #getProcessState()} is in + * {@link ProcessState#PREPROCESSED} state or later. In other words, could be + * null when in {@link ProcessState#CREATED} state. + */ + private final String minJobId; + + /** + * The maximum job ID in this batch of jobs. Used to efficiently scan the RAW + * table. Will not be available until {@link #getProcessState()} is in + * {@link ProcessState#PREPROCESSED} state or later. In other words, could be + * null when in {@link ProcessState#CREATED} state. + */ + private final String maxJobId; + + // TODO: Add identifier who last wrote/updated a record. + + /** + * Representing one batch of JobFiles processed in its initial state. + * + * @param cluster + * the cluster for which we are processing job files. Any + * {@link Constants#SEP} sub-strings will be stripped out. + * @param minModificationTimeMillis + * The minimum modification time of a file to be accepted in + * milliseconds since January 1, 1970 UTC (excluding). + * @param maxModificationTimeMillis + * The maximum modification time of a file to be accepted in + * milliseconds since January 1, 1970 UTC (including). 
+ * @param processedJobFiles + * How many Job files have been processed successfully. For each job + * there would typically be 2 files: a conf and a history file. + * @param processFile + * The file in hdfs where the job file names are stored for this + * process run. + */ + public ProcessRecord(String cluster, long minModificationTimeMillis, + long maxModificationTimeMillis, int processedJobFiles, + String processingDirectory) { + this(cluster, ProcessState.CREATED, minModificationTimeMillis, + maxModificationTimeMillis, processedJobFiles, processingDirectory, + null, null); + + } + + /** + * Representing one batch of JobFiles processed. + * + * @param cluster + * the cluster for which we are processing job files. Any + * {@link Constants#SEP} sub-strings will be stripped out. + * @param processState + * to indicate what kind of processing has happend on this batch of + * Job Conf and History files. + * @param minModificationTimeMillis + * The minimum modification time of a file to be accepted in + * milliseconds since January 1, 1970 UTC (excluding). + * @param maxModificationTimeMillis + * The maximum modification time of a file to be accepted in + * milliseconds since January 1, 1970 UTC (including). + * @param processedJobFiles + * How many Job files have been processed successfully. For each job + * there would typically be 2 files: a conf and a history file. + * @param processFile + * The file in hdfs where the job file names are stored for this + * process run. + * @param minJobId + * The minimum job ID in this batch of jobs. Used to efficiently scan + * the RAW table. Will not be available until + * {@link #getProcessState()} is in {@link ProcessState#PREPROCESSED} + * state or later. In other words, could be null when in + * {@link ProcessState#CREATED} state. + * @param maxJobId + * The maximum job ID in this batch of jobs. Used to efficiently scan + * the RAW table. Will not be available until + * {@link #getProcessState()} is in {@link ProcessState#PREPROCESSED} + * state or later. In other words, could be null when in + * {@link ProcessState#CREATED} state. + */ + public ProcessRecord(String cluster, ProcessState processState, + long minModificationTimeMillis, long maxModificationTimeMillis, + int processedJobFiles, String processFile, String minJobId, + String maxJobId) { + + this.key = new ProcessRecordKey(cluster, maxModificationTimeMillis); + // Note that we have NOT ripped out the separators here. + this.cluster = cluster; + this.processState = processState; + this.minModificationTimeMillis = minModificationTimeMillis; + this.maxModificationTimeMillis = maxModificationTimeMillis; + this.processedJobFiles = processedJobFiles; + this.processFile = processFile; + this.minJobId = minJobId; + this.maxJobId = maxJobId; + } + + /** + * @return the key to be used to store this processing record. It is stored so + * that records are ordered first by cluster and then with the most + * recent record first. + */ + public ProcessRecordKey getKey() { + return key; + } + + /** + * @return the cluster for which we are processing job files. + */ + public String getCluster() { + return cluster; + } + + /** + * @return the last processStae of this processRecord. + */ + public ProcessState getProcessState() { + return processState; + } + + /** + * @return The minimum modification time of a file in milliseconds since + * January 1, 1970 UTC (excluding) encountered in this batch of files. 
+ */ + public long getMinModificationTimeMillis() { + return minModificationTimeMillis; + } + + /** + * @return The maximum modification time of a file in milliseconds since + * January 1, 1970 UTC (including) encountered in this batch of + * files.. + */ + public long getMaxModificationTimeMillis() { + return maxModificationTimeMillis; + } + + /** + * @return How many Job files have been processed successfully. For each job + * there would typically be 2 files: a conf and a history file. + */ + public int getProcessedJobFiles() { + return processedJobFiles; + } + + /** + * @return The file in hdfs where the job file names are stored for this process run. + */ + public String getProcessFile() { + return processFile; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + + String minTimestamp = Constants.TIMESTAMP_FORMAT.format(new Date( + minModificationTimeMillis)); + String maxTimestamp = Constants.TIMESTAMP_FORMAT.format(new Date( + maxModificationTimeMillis)); + + String me = ProcessRecord.class.getSimpleName(); + me += "(" + key + ") "; + me += minTimestamp + "-" + maxTimestamp + " "; + me += processState + ": "; + me += processedJobFiles + " job files in "; + me += processFile; + me += " minJobId: " + minJobId; + me += " maxJobId: " + maxJobId; + + return me; + } + + /** + * The minimum job ID in this batch of jobs. Used to efficiently scan the RAW + * table. Will not be available until {@link #getProcessState()} is in + * {@link ProcessState#PREPROCESSED} state or later. In other words, could be + * null when in {@link ProcessState#CREATED} state. + */ + public String getMinJobId() { + return minJobId; + } + + /** + * The maximum job ID in this batch of jobs. Used to efficiently scan the RAW + * table. Will not be available until {@link #getProcessState()} is in + * {@link ProcessState#PREPROCESSED} state or later. In other words, could be + * null when in {@link ProcessState#CREATED} state. + */ + public String getMaxJobId() { + return maxJobId; + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordKey.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordKey.java new file mode 100644 index 0000000..e1a7451 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordKey.java @@ -0,0 +1,54 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.etl; + +/** + */ +public class ProcessRecordKey { + private final String cluster; + private final long timestamp; + + public ProcessRecordKey(String cluster, long timestamp) { + this.cluster = cluster; + this.timestamp = timestamp; + } + + public String getCluster() { + return cluster; + } + + public long getTimestamp() { + return timestamp; + } + + @Override + public boolean equals(Object other) { + if (other != null && other instanceof ProcessRecordKey) { + return cluster.equals(((ProcessRecordKey) other).getCluster()) && + timestamp == ((ProcessRecordKey) other).getTimestamp(); + } + return false; + } + + public String toString() { + return new StringBuilder("ProcessRecordKey[cluster=") + .append(cluster) + .append(", timestamp=") + .append(timestamp) + .append("]") + .toString(); + } +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordKeyConverter.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordKeyConverter.java new file mode 100644 index 0000000..9a0815a --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordKeyConverter.java @@ -0,0 +1,42 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import org.apache.hadoop.hbase.util.Bytes; +import com.twitter.hraven.Constants; +import com.twitter.hraven.datasource.ByteConverter; +import com.twitter.hraven.etl.ProcessRecordKey; +import com.twitter.hraven.util.ByteUtil; + +/** + */ +public class ProcessRecordKeyConverter implements ByteConverter { + @Override + public byte[] toBytes(ProcessRecordKey key) { + long invertedTimestamp = Long.MAX_VALUE - key.getTimestamp(); + return ByteUtil.join(Constants.SEP_BYTES, + Bytes.toBytes(key.getCluster()), + Bytes.toBytes(invertedTimestamp)); + } + + @Override + public ProcessRecordKey fromBytes(byte[] bytes) { + byte[][] parts = ByteUtil.split(bytes, Constants.SEP_BYTES, 2); + long invertedTimestamp = Bytes.toLong(parts[1]); + return new ProcessRecordKey(Bytes.toString(parts[0]), + Long.MAX_VALUE - invertedTimestamp); + } +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordService.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordService.java new file mode 100644 index 0000000..fc067b2 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordService.java @@ -0,0 +1,496 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.etl; + +import static com.twitter.hraven.etl.ProcessState.CREATED; +import static org.apache.hadoop.hbase.filter.CompareFilter.CompareOp.EQUAL; +import static org.apache.hadoop.hbase.filter.CompareFilter.CompareOp.NOT_EQUAL; +import static org.apache.hadoop.hbase.filter.CompareFilter.CompareOp.NO_OP; + +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.TimeZone; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp; +import org.apache.hadoop.hbase.filter.Filter; +import org.apache.hadoop.hbase.filter.FilterList; +import org.apache.hadoop.hbase.filter.SingleColumnValueFilter; +import org.apache.hadoop.hbase.filter.SubstringComparator; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.SequenceFile.Writer; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.datasource.ProcessingException; +import com.twitter.hraven.etl.JobFile; +import com.twitter.hraven.etl.ProcessRecord; +import com.twitter.hraven.etl.ProcessRecordKey; +import com.twitter.hraven.etl.ProcessState; + +/** + * Used to store and retrieve {@link ProcessRecord} objects. + * + */ +public class ProcessRecordService { + /** + * Used to get the end of the day in millis like so yyyy-MM-dd HH:mm:ss.SSS + */ + public static final SimpleDateFormat MILLISECOND_TIMSTAMP_FORMAT = new SimpleDateFormat( + "yyyy-MM-dd HH:mm:ss.SSS"); + + // Initialize to use UTC + static { + TimeZone utc = TimeZone.getTimeZone("UTC"); + MILLISECOND_TIMSTAMP_FORMAT.setTimeZone(utc); + } + + private static Log LOG = LogFactory.getLog(ProcessRecordService.class); + + private ProcessRecordKeyConverter keyConv = new ProcessRecordKeyConverter(); + + /** + * Used to store the processRecords in HBase + */ + private final HTable processRecordTable; + + /** + * Used to access the filesystem. + */ + private final Configuration myHBaseConf; + + private final FileSystem fs; + + /** + * Constructor. Note that caller is responsible to {@link #close()} created + * instances. + * + * @param myHBaseConf + * configuration of the processing job, not the conf of the files we + * are processing. This should be an HBase conf so that we can access + * the appropriate cluster. + * @throws IOException + * in case we have problems connecting to HBase. + */ + public ProcessRecordService(Configuration myHBaseConf) throws IOException { + processRecordTable = new HTable(myHBaseConf, + Constants.JOB_FILE_PROCESS_TABLE_BYTES); + this.myHBaseConf = myHBaseConf; + fs = FileSystem.get(myHBaseConf); + } + + /** + * Write all fields of a record to HBase. To be used in initial insert, or to + * overwrite whatever values are there in HBase. + *
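+ * A minimal usage sketch (names and values here are illustrative only),
+ * assuming an HBase {@code Configuration} named {@code hbaseConf} is at hand:
+ *
+ * <pre>
+ * ProcessRecordService service = new ProcessRecordService(hbaseConf);
+ * try {
+ *   ProcessRecord record = new ProcessRecord("cluster1@dc1",
+ *       ProcessState.PREPROCESSED, 0L, System.currentTimeMillis(), 20,
+ *       "/hraven/processing/cluster1-batch-0", "job_201206010000_0001",
+ *       "job_201206010000_0020");
+ *   service.writeJobRecord(record);
+ * } finally {
+ *   service.close();
+ * }
+ * </pre>
+ *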

+ * Consider using {@link #setProcessState(ProcessRecord, ProcessState)} if you + * want to update only the state. + * + * @param processRecord + * non-null ProcessRecord to write to HBase. + * @throws IOException + * if the record cannot be written. + */ + public void writeJobRecord(ProcessRecord processRecord) throws IOException { + + byte[] key = keyConv.toBytes(processRecord.getKey()); + Put put = new Put(key); + + put.add(Constants.INFO_FAM_BYTES, + Constants.MIN_MOD_TIME_MILLIS_COLUMN_BYTES, + Bytes.toBytes(processRecord.getMinModificationTimeMillis())); + put.add(Constants.INFO_FAM_BYTES, + Constants.PROCESSED_JOB_FILES_COLUMN_BYTES, + Bytes.toBytes(processRecord.getProcessedJobFiles())); + put.add(Constants.INFO_FAM_BYTES, Constants.PROCESS_FILE_COLUMN_BYTES, + Bytes.toBytes(processRecord.getProcessFile())); + put.add(Constants.INFO_FAM_BYTES, Constants.PROCESSING_STATE_COLUMN_BYTES, + Bytes.toBytes(processRecord.getProcessState().getCode())); + put.add(Constants.INFO_FAM_BYTES, Constants.MIN_JOB_ID_COLUMN_BYTES, + Bytes.toBytes(processRecord.getMinJobId())); + put.add(Constants.INFO_FAM_BYTES, Constants.MAX_JOB_ID_COLUMN_BYTES, + Bytes.toBytes(processRecord.getMaxJobId())); + + processRecordTable.put(put); + } + + /** + * @param cluster + * for which to return the last ProcessRecord. + * @return the last process record that is not in {@link ProcessState#CREATED} + * state. + * @throws IOException + */ + public ProcessRecord getLastSuccessfulProcessRecord(String cluster) + throws IOException { + List processRecords = getProcessRecords(cluster, NOT_EQUAL, + CREATED, 1, null); + if (processRecords.size() > 0) { + return processRecords.get(0); + } + // Did not get any record. + return null; + } + + /** + * @param cluster + * for which to return the last ProcessRecord. + * @param maxCount + * the maximum number of results to return. + * @param processFileSubstring + * return rows where the process file path contains this string. If + * null or empty string, then no filtering is applied. + * @return the last process record that is not in {@link ProcessState#CREATED} + * state. Note that no records with a maxModificationTime of 0 + * (beginning of time) will be returned + * @throws IOException + */ + public List getProcessRecords(String cluster, int maxCount, + String processFileSubstring) throws IOException { + return getProcessRecords(cluster, NO_OP, null, maxCount, + processFileSubstring); + } + + /** + * @param cluster + * for which to return the last ProcessRecord. + * @param processState + * return only rows with this state + * @param maxCount + * the maximum number of results to return. + * @param processFileSubstring + * return rows where the process file path contains this string. If + * null or empty string, then no filtering is applied. + * @return the last process record that is not in {@link ProcessState#CREATED} + * state. Note that no records with a maxModificationTime of 0 + * (beginning of time) will be returned + * @throws IOException + */ + public List getProcessRecords(String cluster, + ProcessState processState, int maxCount, String processFileSubstring) + throws IOException { + return getProcessRecords(cluster, EQUAL, processState, maxCount, + processFileSubstring); + } + + /** + * @param cluster + * for which to return the last ProcessRecord. + * @param compareOp + * to apply to the processState argument. If {@link CompareOp#NO_OP} + * is passed, then no filter is used at all, and processState + * argument is ignored. 
+ * @param processState + * return rows where the compareOp applies. + * @param maxCount + * the maximum number of results to return. + * @param processFileSubstring + * return rows where the process file path contains this string. If + * null or empty string, then no filtering is applied. + * @return the last process record that is not in {@link ProcessState#CREATED} + * state. Note that no records with a maxModificationTime of 0 + * (beginning of time) will be returned + * @throws IOException + */ + public List getProcessRecords(String cluster, + CompareOp compareOp, ProcessState processState, int maxCount, + String processFileSubstring) throws IOException { + Scan scan = new Scan(); + // Pull data only for our cluster + scan.setStartRow(keyConv.toBytes(new ProcessRecordKey(cluster, + Long.MAX_VALUE))); + // Records are sorted in reverse order, so the last one for this cluster + // would be the one with a modification time at the beginning of time. + scan.setStopRow(keyConv.toBytes(new ProcessRecordKey(cluster, 0))); + + scan.addColumn(Constants.INFO_FAM_BYTES, + Constants.MIN_MOD_TIME_MILLIS_COLUMN_BYTES); + scan.addColumn(Constants.INFO_FAM_BYTES, + Constants.PROCESSED_JOB_FILES_COLUMN_BYTES); + scan.addColumn(Constants.INFO_FAM_BYTES, + Constants.PROCESS_FILE_COLUMN_BYTES); + scan.addColumn(Constants.INFO_FAM_BYTES, + Constants.PROCESSING_STATE_COLUMN_BYTES); + scan.addColumn(Constants.INFO_FAM_BYTES, Constants.MIN_JOB_ID_COLUMN_BYTES); + scan.addColumn(Constants.INFO_FAM_BYTES, Constants.MAX_JOB_ID_COLUMN_BYTES); + scan.setMaxVersions(1); + + FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL); + + // Filter on process state only when needed. + if (!NO_OP.equals(compareOp)) { + byte[] filterColumnValue = Bytes.toBytes(processState.getCode()); + Filter processingStatefilter = new SingleColumnValueFilter( + Constants.INFO_FAM_BYTES, Constants.PROCESSING_STATE_COLUMN_BYTES, + compareOp, filterColumnValue); + filterList.addFilter(processingStatefilter); + } + + // Filter on process file only when needed + if (processFileSubstring != null && processFileSubstring.length() > 0) { + SubstringComparator ssc = new SubstringComparator(processFileSubstring); + Filter processFileFilter = new SingleColumnValueFilter( + Constants.INFO_FAM_BYTES, Constants.PROCESS_FILE_COLUMN_BYTES, EQUAL, + ssc); + filterList.addFilter(processFileFilter); + } + + // Add filters only if any filter was actually needed. + if (filterList.getFilters().size() > 0) { + scan.setFilter(filterList); + } + + ResultScanner scanner = null; + + List records = null; + try { + scanner = processRecordTable.getScanner(scan); + records = createFromResults(scanner, maxCount); + } finally { + if (scanner != null) { + scanner.close(); + } + } + + return records; + } + + /** + * Transform results pulled from a scanner and turn into a list of + * ProcessRecords. + * + * @param scanner + * used to pull the results from, in the order determined by the + * scanner. + * @param maxCount + * maximum number of results to return. 
+ * @return + */ + private List createFromResults(ResultScanner scanner, + int maxCount) { + // Defensive coding + if ((maxCount <= 0) || (scanner == null)) { + return new ArrayList(0); + } + List records = new ArrayList(); + + for (Result result : scanner) { + byte[] row = result.getRow(); + ProcessRecordKey key = keyConv.fromBytes(row); + + KeyValue keyValue = result.getColumnLatest(Constants.INFO_FAM_BYTES, + Constants.MIN_MOD_TIME_MILLIS_COLUMN_BYTES); + long minModificationTimeMillis = Bytes.toLong(keyValue.getValue()); + + keyValue = result.getColumnLatest(Constants.INFO_FAM_BYTES, + Constants.PROCESSED_JOB_FILES_COLUMN_BYTES); + int processedJobFiles = Bytes.toInt(keyValue.getValue()); + + keyValue = result.getColumnLatest(Constants.INFO_FAM_BYTES, + Constants.PROCESS_FILE_COLUMN_BYTES); + String processingDirectory = Bytes.toString(keyValue.getValue()); + + keyValue = result.getColumnLatest(Constants.INFO_FAM_BYTES, + Constants.PROCESSING_STATE_COLUMN_BYTES); + ProcessState processState = ProcessState.getProcessState(Bytes + .toInt(keyValue.getValue())); + + keyValue = result.getColumnLatest(Constants.INFO_FAM_BYTES, + Constants.MIN_JOB_ID_COLUMN_BYTES); + String minJobId = null; + if (keyValue != null) { + minJobId = Bytes.toString(keyValue.getValue()); + } + + keyValue = result.getColumnLatest(Constants.INFO_FAM_BYTES, + Constants.MAX_JOB_ID_COLUMN_BYTES); + String maxJobId = null; + if (keyValue != null) { + maxJobId = Bytes.toString(keyValue.getValue()); + } + + ProcessRecord processRecord = new ProcessRecord(key.getCluster(), + processState, minModificationTimeMillis, key.getTimestamp(), + processedJobFiles, processingDirectory, minJobId, maxJobId); + records.add(processRecord); + + // Check if we retrieved enough records. + if (records.size() >= maxCount) { + break; + } + } + + LOG.info("Returning " + records.size() + " process records"); + + return records; + } + + /** + * Set the process state for a given processRecord. + * + * @param processRecord + * for which to update the state + * @param newState + * the new state to set in HBase. + * @return a new ProcessRecord with the new state. 
+ * @throws IOException + */ + public ProcessRecord setProcessState(ProcessRecord processRecord, + ProcessState newState) throws IOException { + Put put = new Put(keyConv.toBytes(processRecord.getKey())); + put.add(Constants.INFO_FAM_BYTES, Constants.PROCESSING_STATE_COLUMN_BYTES, + Bytes.toBytes(newState.getCode())); + processRecordTable.put(put); + ProcessRecord updatedProcessRecord = new ProcessRecord( + processRecord.getCluster(), newState, + processRecord.getMinModificationTimeMillis(), + processRecord.getMaxModificationTimeMillis(), + processRecord.getProcessedJobFiles(), processRecord.getProcessFile(), + processRecord.getMinJobId(), processRecord.getMaxJobId()); + return updatedProcessRecord; + } + + /** + * @param year + * the year in 4 characters like "2012" + * @param month + * the month in 2 characters like "05" + * @param day + * the day in 2 characters like "08" + * @return End of the day in milliseconds since January 1, 1970 UTC + * (including) + */ + long getEndOfDayMillis(String year, String month, String day) { + // Assemble string in this format: yyyy-MM-dd HH:mm:ss.SSS + String endOfDay = year + "-" + month + "-" + day + " 23:59:59.999"; + try { + Date endOfDayDate = MILLISECOND_TIMSTAMP_FORMAT.parse(endOfDay); + return endOfDayDate.getTime(); + } catch (java.text.ParseException e) { + throw new IllegalArgumentException("Cannot parse: " + endOfDay); + } + } + + /** + * @param year + * the year in 4 characters like "2012" + * @param month + * the month in 2 characters like "05" + * @param day + * the day in 2 characters like "08" + * @return Start of the day in milliseconds since January 1, 1970 UTC + * (including) + */ + long getStartOfDayMillis(String year, String month, String day) { + // Assemble string in this format: yyyy-MM-dd HH:mm:ss.SSS + String startOfDay = year + "-" + month + "-" + day + " 00:00:00.000"; + try { + Date startOfDayDate = MILLISECOND_TIMSTAMP_FORMAT.parse(startOfDay); + return startOfDayDate.getTime(); + } catch (java.text.ParseException e) { + throw new IllegalArgumentException("Cannot parse: " + startOfDay); + } + } + + /** + * Release internal HBase table instances. Must be called when consumer is + * done with this service. + * + * @throws IOException + * when bad things happen closing HBase table(s). + */ + public void close() throws IOException { + if (processRecordTable != null) { + processRecordTable.close(); + } + } + + /** + * @param cluster + * the cluster on which the batch of jobs ran. + * @param batch + * indicating which batch this is. Used to make the filename unique. + * @return Path to a processFile in the /tmp directory on the filesystem. + */ + public Path getInitialProcessFile(String cluster, int batch) { + long now = System.currentTimeMillis(); + String timestamp = Constants.TIMESTAMP_FORMAT.format(new Date(now)); + + String safeCluster = ""; + if (cluster != null) { + // rip out everything that is not letter, number or underscore. + safeCluster = cluster.replaceAll("\\W+", ""); + } + + String processFileName = Constants.PROJECT_NAME + "-" + safeCluster + "-" + + timestamp + "-" + batch; + Path tmpDir = new Path("/tmp"); + Path processFile = new Path(tmpDir, processFileName); + return processFile; + } + + /** + * @param processFilePath + * where to write to. + * @return Writer for SequenceFile + * @throws IOException + * when bad things happen. 
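+ *
+ * <p>A rough sketch of how the process-file helpers fit together, given a
+ * {@code ProcessRecordService service}; the paths, variables and batch number
+ * are illustrative and error handling is omitted:
+ * <pre>
+ * Path tmpFile = service.getInitialProcessFile("cluster1", 0);
+ * Writer writer = service.createProcessFileWriter(tmpFile);
+ * writer.append(jobFile, fileStatus); // one entry per JobFile found
+ * writer.close();
+ * Path finalFile = service.moveProcessFile(tmpFile, new Path("/hraven/processFiles"));
+ * </pre>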
+ */ + public Writer createProcessFileWriter(Path processFilePath) + throws IOException { + Writer indexWriter = SequenceFile.createWriter(fs, myHBaseConf, + processFilePath, JobFile.class, FileStatus.class); + return indexWriter; + } + + /** + * @param initialProcessFile + * The path to the file to be moved. + * @param outputPath + * The path where this file is to be moved to. + * @return the new path or null if the rename failed. + * @throws IOException + * when bad things happen. + * @throws ProcessingException + * when the file cannot be moved. + */ + public Path moveProcessFile(Path initialProcessFile, Path outputPath) + throws IOException { + String processFileName = initialProcessFile.getName(); + Path processFile = new Path(outputPath, processFileName); + + boolean success = fs.rename(initialProcessFile, processFile); + if (!success) { + throw new ProcessingException("Unable to move processing file " + + initialProcessFile + " to " + processFile); + } + return processFile; + } +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordUpdater.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordUpdater.java new file mode 100644 index 0000000..0a7fa70 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessRecordUpdater.java @@ -0,0 +1,82 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import java.io.IOException; +import java.util.concurrent.Callable; + +import org.apache.hadoop.conf.Configuration; + +import com.twitter.hraven.etl.ProcessRecordService; + +/** + * Updates a processRecord to the given status when called. + * */ +public class ProcessRecordUpdater implements Callable { + + /** + * Which is to be updated. + */ + private final ProcessRecord processRecord; + + /** + * The new state to set the record to using the service. + */ + private final ProcessState newState; + + /** + * Used to connect to HBase. + */ + private final Configuration hBaseconf; + + /** + * @param hBaseconf + * used to connect to HBase + * @throws IOException + */ + public ProcessRecordUpdater(Configuration hBaseconf, + ProcessRecord processRecord, ProcessState newState) throws IOException { + this.hBaseconf = hBaseconf; + this.processRecord = processRecord; + this.newState = newState; + } + + /* + * (non-Javadoc) + * + * @see java.util.concurrent.Callable#call() + */ + @Override + public Boolean call() throws Exception { + + ProcessRecord updatedRecord = null; + // Connect only when needed. 
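+    // A fresh ProcessRecordService (and hence HTable) is created per call and
+    // closed in the finally block below, so each Callable owns its own HBase
+    // connection for the duration of the update.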
+ ProcessRecordService processRecordService = new ProcessRecordService( + hBaseconf); + try { + updatedRecord = processRecordService.setProcessState(processRecord, + newState); + } finally { + processRecordService.close(); + } + if ((updatedRecord != null) + && (updatedRecord.getProcessState() == newState)) { + return true; + } + return false; + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessState.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessState.java new file mode 100644 index 0000000..3be28dc --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessState.java @@ -0,0 +1,81 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +/** + * Keeps track of the state of the processing of a bunch of job conf and job + * history files. + * + */ +public enum ProcessState { + + /** + * When processing has just started, but no complete set of job files has been + * moved to the processing directory yet. + */ + CREATED(0), + + /** + * Pre-processing step is complete. The number of processed files will + * indicate how many files have been processed. + */ + PREPROCESSED(1), + + /** + * The loading step is complete. The number of processed files will indicate + * how many files have been processed. The record will now also have a min and + * a max job ID processed. + */ + LOADED(2), + + /** + * All job files between the min and the max job ID for a given cluster are + * processed. + */ + PROCESSED(3); + + /** + * Representing this state. + */ + private final int code; + + private ProcessState(int code) { + this.code = code; + } + + /** + * @return the code for this state + */ + public int getCode() { + return code; + } + + /** + * @param code + * representing the state + * @return the ProcessState for this code, or if not recognized, then return + * {@link ProcessState#CREATED} + */ + public static ProcessState getProcessState(int code) { + for (ProcessState state : ProcessState.values()) { + if (state.getCode() == code) { + return state; + } + } + return CREATED; + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessingRecordsPrinter.java b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessingRecordsPrinter.java new file mode 100644 index 0000000..ca95a25 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/etl/ProcessingRecordsPrinter.java @@ -0,0 +1,222 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.etl; + +import java.io.IOException; +import java.util.List; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; + +import com.twitter.hraven.etl.ProcessRecordService; + +/** + * Small utility used to print all processing records given a cluster. + * + */ +public class ProcessingRecordsPrinter extends Configured implements Tool { + + final static String NAME = ProcessingRecordsPrinter.class.getSimpleName(); + private static Log LOG = LogFactory.getLog(ProcessingRecordsPrinter.class); + + /** + * Default constructor. + */ + public ProcessingRecordsPrinter() { + } + + /** + * Used for injecting confs while unit testing + * + * @param conf + */ + public ProcessingRecordsPrinter(Configuration conf) { + super(conf); + } + + /** + * Parse command-line arguments. + * + * @param args + * command line arguments passed to program. + * @return parsed command line. + * @throws ParseException + */ + private static CommandLine parseArgs(String[] args) throws ParseException { + Options options = new Options(); + + // Input + Option o = new Option("m", "maxCount", true, + "maximum number of records to be returned"); + o.setArgName("maxCount"); + o.setRequired(false); + options.addOption(o); + + o = new Option("c", "cluster", true, "cluster for which jobs are processed"); + o.setArgName("cluster"); + o.setRequired(true); + options.addOption(o); + + o = new Option( + "p", + "processFileSubstring", + true, + "use only those process records where the process file path contains the provided string."); + o.setArgName("processFileSubstring"); + o.setRequired(false); + options.addOption(o); + + // Debugging + options.addOption("d", "debug", false, "switch on DEBUG log level"); + + CommandLineParser parser = new PosixParser(); + CommandLine commandLine = null; + try { + commandLine = parser.parse(options, args); + } catch (Exception e) { + System.err.println("ERROR: " + e.getMessage() + "\n"); + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(NAME + " ", options, true); + System.exit(-1); + } + + // Set debug level right away + if (commandLine.hasOption("d")) { + Logger log = Logger.getLogger(ProcessingRecordsPrinter.class); + log.setLevel(Level.DEBUG); + } + + return commandLine; + } + + /* + * (non-Javadoc) + * + * @see org.apache.hadoop.util.Tool#run(java.lang.String[]) + */ + public int run(String[] args) throws Exception { + + Configuration hbaseConf = HBaseConfiguration.create(getConf()); + + // Grab input args and allow for -Dxyz style arguments + String[] otherArgs = new GenericOptionsParser(hbaseConf, args) + .getRemainingArgs(); + + // Grab the arguments we're looking for. 
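+    // parseArgs() enforces the required -c <cluster> option and exits the
+    // process on invalid input, so commandLine is non-null past this point.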
+ CommandLine commandLine = parseArgs(otherArgs); + + // Grab the cluster argument + String cluster = commandLine.getOptionValue("c"); + LOG.info("cluster=" + cluster); + + // Grab the cluster argument + + String processFileSubstring = null; + if (commandLine.hasOption("p")) { + processFileSubstring = commandLine.getOptionValue("p"); + } + LOG.info("processFileSubstring=" + processFileSubstring); + + // Default to no max + Integer maxCount = Integer.MAX_VALUE; + if (commandLine.hasOption("m")) { + try { + maxCount = Integer.parseInt(commandLine.getOptionValue("m")); + } catch (NumberFormatException nfe) { + System.err.println("Error: " + NAME + " maxCount is not an integer: " + + commandLine.getOptionValue("m")); + } + } + + boolean success = printProcessRecordsFromHBase(hbaseConf, cluster, + maxCount, processFileSubstring); + + // Return the status + return success ? 0 : 1; + } + + /** + * @param conf + * used to contact HBase and to run jobs against + * @param cluster + * for which to process records. + * @param processFileSubstring + * return rows where the process file path contains this string. If + * null or empty string, then no filtering is applied. + * @return whether all job files for all processRecords were properly Printed. + * @throws IOException + */ + private boolean printProcessRecordsFromHBase(Configuration conf, + String cluster, int maxCount, String processFileSubstring) + throws IOException { + ProcessRecordService processRecordService = new ProcessRecordService(conf); + List processRecords = processRecordService + .getProcessRecords(cluster, maxCount, processFileSubstring); + try { + + int jobFileCount = 0; + + System.out.println("ProcessRecords for " + cluster + ": " + + processRecords.size()); + + // Iterate over 0 based list in reverse order + for (int j = processRecords.size() - 1; j >= 0; j--) { + ProcessRecord processRecord = processRecords.get(j); + + // Print the whole thing. + System.out.println(processRecord); + jobFileCount += processRecord.getProcessedJobFiles(); + } + System.out.println("Printed " + processRecords.size() + + " records with a total of " + jobFileCount + " files."); + } finally { + processRecordService.close(); + } + + return true; + } + + /** + * DoIt. + * + * @param args + * the arguments to do it with + */ + public static void main(String[] args) { + try { + ToolRunner.run(new ProcessingRecordsPrinter(), args); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/CombineFileInputFormat.java b/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/CombineFileInputFormat.java new file mode 100644 index 0000000..b7c5ffc --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/CombineFileInputFormat.java @@ -0,0 +1,626 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.twitter.hraven.mapreduce; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.mapreduce.InputFormat; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.net.NetworkTopology; +import org.apache.hadoop.net.NodeBase; + +/** + * An abstract {@link InputFormat} that returns {@link CombineFileSplit}'s in + * {@link InputFormat#getSplits(JobContext)} method. + * + * Splits are constructed from the files under the input paths. + * A split cannot have files from different pools. + * Each split returned may contain blocks from different files. + * If a maxSplitSize is specified, then blocks on the same node are + * combined to form a single split. Blocks that are left over are + * then combined with other blocks in the same rack. + * If maxSplitSize is not specified, then blocks from the same rack + * are combined in a single split; no attempt is made to create + * node-local splits. + * If the maxSplitSize is equal to the block size, then this class + * is similar to the default splitting behavior in Hadoop: each + * block is a locally processed split. + * Subclasses implement + * {@link InputFormat#createRecordReader(InputSplit, TaskAttemptContext)} + * to construct RecordReader's for + * CombineFileSplit's. + * + * @see CombineFileSplit + */ +public abstract class CombineFileInputFormat + extends FileInputFormat { + + // ability to limit the size of a single split + private long maxSplitSize = 0; + private long minSplitSizeNode = 0; + private long minSplitSizeRack = 0; + + // A pool of input paths filters. A split cannot have blocks from files + // across multiple pools. + private ArrayList pools = new ArrayList(); + + // mapping from a rack name to the set of Nodes in the rack + private HashMap> rackToNodes = + new HashMap>(); + /** + * Specify the maximum size (in bytes) of each split. Each split is + * approximately equal to the specified size. + */ + protected void setMaxSplitSize(long maxSplitSize) { + this.maxSplitSize = maxSplitSize; + } + + /** + * Specify the minimum size (in bytes) of each split per node. + * This applies to data that is left over after combining data on a single + * node into splits that are of maximum size specified by maxSplitSize. + * This leftover data will be combined into its own split if its size + * exceeds minSplitSizeNode. + */ + protected void setMinSplitSizeNode(long minSplitSizeNode) { + this.minSplitSizeNode = minSplitSizeNode; + } + + /** + * Specify the minimum size (in bytes) of each split per rack. 
+ * This applies to data that is left over after combining data on a single + * rack into splits that are of maximum size specified by maxSplitSize. + * This leftover data will be combined into its own split if its size + * exceeds minSplitSizeRack. + */ + protected void setMinSplitSizeRack(long minSplitSizeRack) { + this.minSplitSizeRack = minSplitSizeRack; + } + + /** + * Create a new pool and add the filters to it. + * A split cannot have files from different pools. + */ + protected void createPool(List filters) { + pools.add(new MultiPathFilter(filters)); + } + + /** + * Create a new pool and add the filters to it. + * A pathname can satisfy any one of the specified filters. + * A split cannot have files from different pools. + */ + protected void createPool(PathFilter... filters) { + MultiPathFilter multi = new MultiPathFilter(); + for (PathFilter f: filters) { + multi.add(f); + } + pools.add(multi); + } + + /** + * default constructor + */ + public CombineFileInputFormat() { + } + + @Override + public List getSplits(JobContext job) + throws IOException { + + long minSizeNode = 0; + long minSizeRack = 0; + long maxSize = 0; + Configuration conf = job.getConfiguration(); + + // the values specified by setxxxSplitSize() takes precedence over the + // values that might have been specified in the config + if (minSplitSizeNode != 0) { + minSizeNode = minSplitSizeNode; + } else { + minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0); + } + if (minSplitSizeRack != 0) { + minSizeRack = minSplitSizeRack; + } else { + minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0); + } + if (maxSplitSize != 0) { + maxSize = maxSplitSize; + } else { + maxSize = conf.getLong("mapred.max.split.size", 0); + } + if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) { + throw new IOException("Minimum split size pernode " + minSizeNode + + " cannot be larger than maximum split size " + + maxSize); + } + if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) { + throw new IOException("Minimum split size per rack" + minSizeRack + + " cannot be larger than maximum split size " + + maxSize); + } + if (minSizeRack != 0 && minSizeNode > minSizeRack) { + throw new IOException("Minimum split size per node" + minSizeNode + + " cannot be smaller than minimum split " + + "size per rack " + minSizeRack); + } + + // all the files in input set + Path[] paths = FileUtil.stat2Paths( + listStatus(job).toArray(new FileStatus[0])); + List splits = new ArrayList(); + if (paths.length == 0) { + return splits; + } + + // Convert them to Paths first. This is a costly operation and + // we should do it first, otherwise we will incur doing it multiple + // times, one time each for each pool in the next loop. + List newpaths = new LinkedList(); + for (int i = 0; i < paths.length; i++) { + Path p = new Path(paths[i].toUri().getPath()); + newpaths.add(p); + } + paths = null; + + System.out.println("Getting splits for: " + newpaths.size() + " paths."); + + // In one single iteration, process all the paths in a single pool. + // Processing one pool at a time ensures that a split contains paths + // from a single pool only. + for (MultiPathFilter onepool : pools) { + ArrayList myPaths = new ArrayList(); + + System.out.println("Getting splits for a pool"); + + // pick one input path. 
If it matches all the filters in a pool, + // add it to the output set + for (Iterator iter = newpaths.iterator(); iter.hasNext();) { + Path p = iter.next(); + if (onepool.accept(p)) { + myPaths.add(p); // add it to my output set + iter.remove(); + } + } + System.out.println("Getting splits. myPaths size: " + myPaths.size()); + // create splits for all files in this pool. + getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]), + maxSize, minSizeNode, minSizeRack, splits); + } + + // create splits for all files that are not in any pool. + getMoreSplits(conf, newpaths.toArray(new Path[newpaths.size()]), + maxSize, minSizeNode, minSizeRack, splits); + + // free up rackToNodes map + rackToNodes.clear(); + return splits; + } + + /** + * Return all the splits in the specified set of paths + */ + private void getMoreSplits(Configuration conf, Path[] paths, + long maxSize, long minSizeNode, long minSizeRack, + List splits) + throws IOException { + + // all blocks for all the files in input set + OneFileInfo[] files; + + // mapping from a rack name to the list of blocks it has + HashMap> rackToBlocks = + new HashMap>(); + + // mapping from a block to the nodes on which it has replicas + HashMap blockToNodes = + new HashMap(); + + // mapping from a node to the list of blocks that it contains + HashMap> nodeToBlocks = + new HashMap>(); + + files = new OneFileInfo[paths.length]; + if (paths.length == 0) { + return; + } + + // populate all the blocks for all files + long totLength = 0; + for (int i = 0; i < paths.length; i++) { + files[i] = new OneFileInfo(paths[i], conf, + rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes); + totLength += files[i].getLength(); + } + + ArrayList validBlocks = new ArrayList(); + ArrayList nodes = new ArrayList(); + long curSplitSize = 0; + + // process all nodes and create splits that are local + // to a node. + for (Iterator>> iter = nodeToBlocks.entrySet().iterator(); + iter.hasNext();) { + + Map.Entry> one = iter.next(); + nodes.add(one.getKey()); + List blocksInNode = one.getValue(); + + // for each block, copy it into validBlocks. Delete it from + // blockToNodes so that the same block does not appear in + // two different splits. + for (OneBlockInfo oneblock : blocksInNode) { + if (blockToNodes.containsKey(oneblock)) { + validBlocks.add(oneblock); + blockToNodes.remove(oneblock); + curSplitSize += oneblock.length; + + // if the accumulated split size exceeds the maximum, then + // create this split. + if (maxSize != 0 && curSplitSize >= maxSize) { + // create an input split and add it to the splits array + addCreatedSplit(splits, nodes, validBlocks); + curSplitSize = 0; + validBlocks.clear(); + } + } + } + // if there were any blocks left over and their combined size is + // larger than minSplitNode, then combine them into one split. + // Otherwise add them back to the unprocessed pool. It is likely + // that they will be combined with other blocks from the + // same rack later on. + if (minSizeNode != 0 && curSplitSize >= minSizeNode) { + // create an input split and add it to the splits array + addCreatedSplit(splits, nodes, validBlocks); + } else { + for (OneBlockInfo oneblock : validBlocks) { + blockToNodes.put(oneblock, oneblock.hosts); + } + } + validBlocks.clear(); + nodes.clear(); + curSplitSize = 0; + } + + // if blocks in a rack are below the specified minimum size, then keep them + // in 'overflow'. After the processing of all racks is complete, these + // overflow blocks will be combined into splits. 
+ ArrayList overflowBlocks = new ArrayList(); + ArrayList racks = new ArrayList(); + + // Process all racks over and over again until there is no more work to do. + while (blockToNodes.size() > 0) { + + // Create one split for this rack before moving over to the next rack. + // Come back to this rack after creating a single split for each of the + // remaining racks. + // Process one rack location at a time, Combine all possible blocks that + // reside on this rack as one split. (constrained by minimum and maximum + // split size). + + // iterate over all racks + for (Iterator>> iter = + rackToBlocks.entrySet().iterator(); iter.hasNext();) { + + Map.Entry> one = iter.next(); + racks.add(one.getKey()); + List blocks = one.getValue(); + + // for each block, copy it into validBlocks. Delete it from + // blockToNodes so that the same block does not appear in + // two different splits. + boolean createdSplit = false; + for (OneBlockInfo oneblock : blocks) { + if (blockToNodes.containsKey(oneblock)) { + validBlocks.add(oneblock); + blockToNodes.remove(oneblock); + curSplitSize += oneblock.length; + + // if the accumulated split size exceeds the maximum, then + // create this split. + if (maxSize != 0 && curSplitSize >= maxSize) { + // create an input split and add it to the splits array + addCreatedSplit(splits, getHosts(racks), validBlocks); + createdSplit = true; + break; + } + } + } + + // if we created a split, then just go to the next rack + if (createdSplit) { + curSplitSize = 0; + validBlocks.clear(); + racks.clear(); + continue; + } + + if (!validBlocks.isEmpty()) { + if (minSizeRack != 0 && curSplitSize >= minSizeRack) { + // if there is a minimum size specified, then create a single split + // otherwise, store these blocks into overflow data structure + addCreatedSplit(splits, getHosts(racks), validBlocks); + } else { + // There were a few blocks in this rack that + // remained to be processed. Keep them in 'overflow' block list. + // These will be combined later. + overflowBlocks.addAll(validBlocks); + } + } + curSplitSize = 0; + validBlocks.clear(); + racks.clear(); + } + } + + assert blockToNodes.isEmpty(); + assert curSplitSize == 0; + assert validBlocks.isEmpty(); + assert racks.isEmpty(); + + // Process all overflow blocks + for (OneBlockInfo oneblock : overflowBlocks) { + validBlocks.add(oneblock); + curSplitSize += oneblock.length; + + // This might cause an exiting rack location to be re-added, + // but it should be ok. + for (int i = 0; i < oneblock.racks.length; i++) { + racks.add(oneblock.racks[i]); + } + + // if the accumulated split size exceeds the maximum, then + // create this split. + if (maxSize != 0 && curSplitSize >= maxSize) { + // create an input split and add it to the splits array + addCreatedSplit(splits, getHosts(racks), validBlocks); + curSplitSize = 0; + validBlocks.clear(); + racks.clear(); + } + } + + // Process any remaining blocks, if any. + if (!validBlocks.isEmpty()) { + addCreatedSplit(splits, getHosts(racks), validBlocks); + } + } + + /** + * Create a single split from the list of blocks specified in validBlocks + * Add this new split into splitList. 
+ */ + private void addCreatedSplit(List splitList, + List locations, + ArrayList validBlocks) { + // create an input split + Path[] fl = new Path[validBlocks.size()]; + long[] offset = new long[validBlocks.size()]; + long[] length = new long[validBlocks.size()]; + for (int i = 0; i < validBlocks.size(); i++) { + fl[i] = validBlocks.get(i).onepath; + offset[i] = validBlocks.get(i).offset; + length[i] = validBlocks.get(i).length; + } + + // add this split to the list that is returned + CombineFileSplit thissplit = new CombineFileSplit(fl, offset, + length, locations.toArray(new String[0])); + splitList.add(thissplit); + } + + /** + * This is not implemented yet. + */ + public abstract RecordReader createRecordReader(InputSplit split, + TaskAttemptContext context) throws IOException; + + /** + * information about one file from the File System + */ + private static class OneFileInfo { + private long fileSize; // size of the file + private OneBlockInfo[] blocks; // all blocks in this file + + OneFileInfo(Path path, Configuration conf, + HashMap> rackToBlocks, + HashMap blockToNodes, + HashMap> nodeToBlocks, + HashMap> rackToNodes) + throws IOException { + this.fileSize = 0; + + // get block locations from file system + FileSystem fs = path.getFileSystem(conf); + FileStatus stat = fs.getFileStatus(path); + BlockLocation[] locations = fs.getFileBlockLocations(stat, 0, + stat.getLen()); + // create a list of all block and their locations + if (locations == null) { + blocks = new OneBlockInfo[0]; + } else { + blocks = new OneBlockInfo[locations.length]; + for (int i = 0; i < locations.length; i++) { + + fileSize += locations[i].getLength(); + OneBlockInfo oneblock = new OneBlockInfo(path, + locations[i].getOffset(), + locations[i].getLength(), + locations[i].getHosts(), + locations[i].getTopologyPaths()); + blocks[i] = oneblock; + + // add this block to the block --> node locations map + blockToNodes.put(oneblock, oneblock.hosts); + + // add this block to the rack --> block map + for (int j = 0; j < oneblock.racks.length; j++) { + String rack = oneblock.racks[j]; + List blklist = rackToBlocks.get(rack); + if (blklist == null) { + blklist = new ArrayList(); + rackToBlocks.put(rack, blklist); + } + blklist.add(oneblock); + // Add this host to rackToNodes map + addHostToRack(rackToNodes, oneblock.racks[j], oneblock.hosts[j]); + } + + // add this block to the node --> block map + for (int j = 0; j < oneblock.hosts.length; j++) { + String node = oneblock.hosts[j]; + List blklist = nodeToBlocks.get(node); + if (blklist == null) { + blklist = new ArrayList(); + nodeToBlocks.put(node, blklist); + } + blklist.add(oneblock); + } + } + } + } + + long getLength() { + return fileSize; + } + + OneBlockInfo[] getBlocks() { + return blocks; + } + } + + /** + * information about one block from the File System + */ + private static class OneBlockInfo { + Path onepath; // name of this file + long offset; // offset in file + long length; // length of this block + String[] hosts; // nodes on which this block resides + String[] racks; // network topology of hosts + + OneBlockInfo(Path path, long offset, long len, + String[] hosts, String[] topologyPaths) { + this.onepath = path; + this.offset = offset; + this.hosts = hosts; + this.length = len; + assert (hosts.length == topologyPaths.length || + topologyPaths.length == 0); + + // if the file system does not have any rack information, then + // use dummy rack location. 
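+      // (NodeBase(host, NetworkTopology.DEFAULT_RACK) yields "/default-rack/<host>")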
+ if (topologyPaths.length == 0) { + topologyPaths = new String[hosts.length]; + for (int i = 0; i < topologyPaths.length; i++) { + topologyPaths[i] = (new NodeBase(hosts[i], + NetworkTopology.DEFAULT_RACK)).toString(); + } + } + + // The topology paths have the host name included as the last + // component. Strip it. + this.racks = new String[topologyPaths.length]; + for (int i = 0; i < topologyPaths.length; i++) { + this.racks[i] = (new NodeBase(topologyPaths[i])).getNetworkLocation(); + } + } + } + + private static void addHostToRack(HashMap> rackToNodes, + String rack, String host) { + Set hosts = rackToNodes.get(rack); + if (hosts == null) { + hosts = new HashSet(); + rackToNodes.put(rack, hosts); + } + hosts.add(host); + } + + private List getHosts(List racks) { + List hosts = new ArrayList(); + for (String rack : racks) { + hosts.addAll(rackToNodes.get(rack)); + } + return hosts; + } + + /** + * Accept a path only if any one of filters given in the + * constructor do. + */ + private static class MultiPathFilter implements PathFilter { + private List filters; + + public MultiPathFilter() { + this.filters = new ArrayList(); + } + + public MultiPathFilter(List filters) { + this.filters = filters; + } + + public void add(PathFilter one) { + filters.add(one); + } + + public boolean accept(Path path) { + for (PathFilter filter : filters) { + if (filter.accept(path)) { + return true; + } + } + return false; + } + + public String toString() { + StringBuffer buf = new StringBuffer(); + buf.append("["); + for (PathFilter f: filters) { + buf.append(f); + buf.append(","); + } + buf.append("]"); + return buf.toString(); + } + } +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobFileRawLoaderMapper.java b/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobFileRawLoaderMapper.java new file mode 100644 index 0000000..f6aa599 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobFileRawLoaderMapper.java @@ -0,0 +1,242 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.mapreduce; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Mapper; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.datasource.JobHistoryRawService; +import com.twitter.hraven.etl.JobFile; + +/** + * Used to read records for the processFile (referring to a JobFile). Reads said + * file into the RAW HBase table. 
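+ *
+ * <p>A wiring sketch only (the real driver may differ); it simply shows where
+ * the static output-type helpers below are meant to be used:
+ * <pre>
+ * Job job = new Job(hbaseConf, "jobFileRawLoader");
+ * job.setMapperClass(JobFileRawLoaderMapper.class);
+ * job.setMapOutputKeyClass(JobFileRawLoaderMapper.getOutputKeyClass());
+ * job.setMapOutputValueClass(JobFileRawLoaderMapper.getOutputValueClass());
+ * </pre>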
+ */ +public class JobFileRawLoaderMapper extends + Mapper { + + private static final ImmutableBytesWritable EMPTY = new ImmutableBytesWritable(); + private static Log LOG = LogFactory.getLog(JobFileRawLoaderMapper.class); + + private long keyCount = 0; + + private boolean forceReprocess = false; + + /** + * Used to read the files from. + */ + private FileSystem hdfs; + + /** + * Job configuration for this job. + */ + private Configuration myConf; + + /** + * Service for storing and retrieving job history and conf blobs. + */ + private JobHistoryRawService rawService = null; + + /** + * @return the key class for the job output data. + */ + public static Class getOutputKeyClass() { + return ImmutableBytesWritable.class; + } + + /** + * @return the value class for the job output data. + */ + public static Class getOutputValueClass() { + return Put.class; + } + + @Override + protected void setup(Context context) throws java.io.IOException, + InterruptedException { + + myConf = context.getConfiguration(); + hdfs = FileSystem.get(myConf); + rawService = new JobHistoryRawService(myConf); + + forceReprocess = myConf.getBoolean(Constants.FORCE_REPROCESS_CONF_KEY, + false); + LOG.info("forceReprocess=" + forceReprocess); + + keyCount = 0; + } + + @Override + protected void map(JobFile jobFile, FileStatus fileStatus, Context context) + throws IOException, InterruptedException { + + boolean exists = hdfs.exists(fileStatus.getPath()); + + if (exists) { + /** + * To collect puts to be passed to the mapper. + */ + List puts = new LinkedList(); + + // Determine if we need to process this file. + if (jobFile.isJobConfFile()) { + keyCount++; + byte[] rowKey = getRowKeyBytes(jobFile); + addFileNamePut(puts, rowKey, Constants.JOBCONF_FILENAME_COL_BYTES, + jobFile.getFilename()); + addRawPut(puts, rowKey, Constants.JOBCONF_COL_BYTES, + Constants.JOBCONF_LAST_MODIFIED_COL_BYTES, fileStatus); + if (forceReprocess) { + // Indicate that we processed the RAW was reloaded so that we can be + // picked up in the new process scan. + Put successPut = rawService.getJobProcessedSuccessPut(rowKey, false); + puts.add(successPut); + } + LOG.info("Loaded conf file (" + keyCount + ") size: " + + fileStatus.getLen() + " = " + jobFile.getFilename()); + } else if (jobFile.isJobHistoryFile()) { + keyCount++; + byte[] rowKey = getRowKeyBytes(jobFile); + // Add filename to be used to re-create JobHistory URL later + addFileNamePut(puts, rowKey, Constants.JOBHISTORY_FILENAME_COL_BYTES, + jobFile.getFilename()); + addRawPut(puts, rowKey, Constants.JOBHISTORY_COL_BYTES, + Constants.JOBHISTORY_LAST_MODIFIED_COL_BYTES, fileStatus); + if (forceReprocess) { + // Indicate that we processed the RAW was reloaded so that we can be + // picked up in the new process scan. + Put successPut = rawService.getJobProcessedSuccessPut(rowKey, false); + puts.add(successPut); + } + LOG.info("Loaded history file (" + keyCount + ") size: " + + fileStatus.getLen() + " = " + jobFile.getFilename()); + } else { + System.out.println("Skipping Key: " + jobFile.getFilename()); + } + + for (Put put : puts) { + // Key is ignored, value is a Put + context.write(EMPTY, put); + } + } else { + // TODO: have better error handling. + System.err.println("Unable to find file: " + fileStatus.getPath()); + } + + }; + + /** + * @param jobFile + * @return the byte representation of the rowkey for the raw table. + */ + private byte[] getRowKeyBytes(JobFile jobFile) { + // This is the cluster for which we are processing files. 
+ String cluster = myConf.get(Constants.CLUSTER_JOB_CONF_KEY); + return rawService.getRowKey(cluster, jobFile.getJobid()); + } + + /** + * @param puts + * to add puts to + * @param rowKey + * for the raw table + * @param filenameColumn + * which filename this is (could be for the jobConf of jobHistory + * file). + * @param filename + * the name of the file. + */ + private void addFileNamePut(List puts, byte[] rowKey, + byte[] filenameColumn, String filename) { + Put put = new Put(rowKey); + put.add(Constants.INFO_FAM_BYTES, filenameColumn, Bytes.toBytes(filename)); + puts.add(put); + } + + /** + * Call {@link #readJobFile(FileStatus)} and add the raw bytes and the last + * modified millis to {@code puts} + * + * @param puts + * to add puts to. + * @rowkey to identify the row in the raw table. + * @param rawColumn + * where to add the raw data in + * @param fileStatus + * Referring to the jobFile to load. + * @throws IOException + */ + private void addRawPut(List puts, byte[] rowKey, byte[] rawColumn, + byte[] lastModificationColumn, FileStatus fileStatus) throws IOException { + byte[] rawBytes = readJobFile(fileStatus); + + Put raw = new Put(rowKey); + + byte[] rawLastModifiedMillis = Bytes.toBytes(fileStatus + .getModificationTime()); + + raw.add(Constants.RAW_FAM_BYTES, rawColumn, rawBytes); + raw.add(Constants.INFO_FAM_BYTES, lastModificationColumn, + rawLastModifiedMillis); + puts.add(raw); + } + + /** + * Get the raw bytes and the last modification millis for this JobFile + * + * @return the contents of the job file. + * @throws IOException + * when bad things happen during reading + */ + private byte[] readJobFile(FileStatus fileStatus) throws IOException { + byte[] rawBytes = null; + FSDataInputStream fsdis = null; + try { + long fileLength = fileStatus.getLen(); + int fileLengthInt = (int) fileLength; + rawBytes = new byte[fileLengthInt]; + fsdis = hdfs.open(fileStatus.getPath()); + IOUtils.readFully(fsdis, rawBytes, 0, fileLengthInt); + } finally { + IOUtils.closeStream(fsdis); + } + return rawBytes; + } + + @Override + protected void cleanup(Context context) throws IOException, + InterruptedException { + if (rawService != null) { + rawService.close(); + } + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobFileTableMapper.java b/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobFileTableMapper.java new file mode 100644 index 0000000..fb67e46 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobFileTableMapper.java @@ -0,0 +1,274 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
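addRawPut above stores the file's modification time with Bytes.toBytes(long), which writes an 8-byte big-endian value; a reader gets the millis back with Bytes.toLong. A tiny standalone round-trip check (the timestamp is made up):

import org.apache.hadoop.hbase.util.Bytes;

public class ModTimeRoundTrip {
  public static void main(String[] args) {
    long lastModifiedMillis = 1372200000000L;            // example timestamp only
    byte[] cell = Bytes.toBytes(lastModifiedMillis);     // what addRawPut writes into the cell
    long readBack = Bytes.toLong(cell);                  // what a scanner would decode
    System.out.println(readBack == lastModifiedMillis);  // true
  }
}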
+*/ +package com.twitter.hraven.mapreduce; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.TableMapper; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobHistoryCopy; +import org.apache.hadoop.mapreduce.Mapper; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.JobDesc; +import com.twitter.hraven.JobDescFactory; +import com.twitter.hraven.JobKey; +import com.twitter.hraven.QualifiedJobId; +import com.twitter.hraven.datasource.AppVersionService; +import com.twitter.hraven.datasource.JobHistoryByIdService; +import com.twitter.hraven.datasource.JobHistoryRawService; +import com.twitter.hraven.datasource.JobHistoryService; +import com.twitter.hraven.datasource.MissingColumnInResultException; +import com.twitter.hraven.datasource.ProcessingException; +import com.twitter.hraven.datasource.RowKeyParseException; +import com.twitter.hraven.etl.ProcessRecordService; + +/** + * Takes in results from a scan from {@link ProcessRecordService + * @getHistoryRawTableScan}, process both job file and history file and emit out + * as puts for the {@link Constants#HISTORY_TABLE} + *
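Because this mapper keys each output record by destination table name (the RAW, job history, and task tables), a driver would typically pair it with MultiTableOutputFormat. A hedged sketch of that wiring; the actual driver is JobFileProcessor elsewhere in this patch, so the job name and scan handling here are assumptions:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;

import com.twitter.hraven.mapreduce.JobFileTableMapper;

public class TableMapperDriverSketch {
  /** Illustrative only: scan the RAW table and let the mapper fan Puts out to several tables. */
  public static Job createJob(String rawTableName, Scan rawScan) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    Job job = new Job(conf, "jobFileProcessorSketch");   // invented job name
    job.setJarByClass(TableMapperDriverSketch.class);

    TableMapReduceUtil.initTableMapperJob(rawTableName, rawScan,
        JobFileTableMapper.class,
        JobFileTableMapper.getOutputKeyClass(),
        JobFileTableMapper.getOutputValueClass(),
        job);

    job.setOutputFormatClass(MultiTableOutputFormat.class);
    job.setNumReduceTasks(0);   // map-only: each output key names the destination table
    return job;
  }
}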
+ *
+ * As a side-affect we'll load an index record into the + * {@link Constants#HISTORY_BY_JOBID_TABLE} as well. + * + */ +public class JobFileTableMapper extends + TableMapper { + + private static Log LOG = LogFactory.getLog(JobFileTableMapper.class); + + private static final ImmutableBytesWritable JOB_TABLE = new ImmutableBytesWritable( + Constants.HISTORY_TABLE_BYTES); + private static final ImmutableBytesWritable TASK_TABLE = new ImmutableBytesWritable( + Constants.HISTORY_TASK_TABLE_BYTES); + private static final ImmutableBytesWritable RAW_TABLE = new ImmutableBytesWritable( + Constants.HISTORY_RAW_TABLE_BYTES); + + /** + * Used to create secondary index. + */ + private JobHistoryByIdService jobHistoryByIdService = null; + + /** + * Used to keep track of all the versions of the app we have seen. + */ + private AppVersionService appVersionService = null; + + /** + * Used to store raw blobs of job history and job conf + */ + private JobHistoryRawService rawService = null; + + private long keyCount = 0; + + /** + * @return the key class for the job output data. + */ + public static Class> getOutputKeyClass() { + return ImmutableBytesWritable.class; + } + + /** + * @return the value class for the job output data. + */ + public static Class getOutputValueClass() { + return Put.class; + } + + @Override + protected void setup( + Mapper.Context context) + throws java.io.IOException, InterruptedException { + Configuration myConf = context.getConfiguration(); + jobHistoryByIdService = new JobHistoryByIdService(myConf); + appVersionService = new AppVersionService(myConf); + rawService = new JobHistoryRawService(myConf); + + keyCount = 0; + } + + @Override + protected void map( + ImmutableBytesWritable key, + Result value, + Mapper.Context context) + throws java.io.IOException, InterruptedException { + + keyCount++; + boolean success = true; + QualifiedJobId qualifiedJobId = null; + try { + qualifiedJobId = rawService.getQualifiedJobIdFromResult(value); + context.progress(); + + Configuration jobConf = rawService.createConfigurationFromResult(value); + context.progress(); + + long submitTimeMillis = rawService.getSubmitTimeMillisFromResult(value); + context.progress(); + + Put submitTimePut = rawService.getJobSubmitTimePut(value.getRow(), + submitTimeMillis); + context.write(RAW_TABLE, submitTimePut); + + JobDesc jobDesc = JobDescFactory.createJobDesc(qualifiedJobId, + submitTimeMillis, jobConf); + JobKey jobKey = new JobKey(jobDesc); + context.progress(); + + // TODO: remove sysout + String msg = "JobDesc (" + keyCount + "): " + jobDesc + + " submitTimeMillis: " + submitTimeMillis; + LOG.info(msg); + + List puts = JobHistoryService.getHbasePuts(jobDesc, jobConf); + + LOG.info("Writing " + puts.size() + " JobConf puts to " + + Constants.HISTORY_TABLE); + + // TODO: + // For Scalding just convert the flowID as a Hex number. Use that for the + // runID. + // Then do a post-processing step to re-write scalding flows. Rewrite + // rows. + // Scan should get the first (lowest job-id) then grab the start-time from + // the Job. 
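The TODO above proposes interpreting the Scalding flow ID, a hex string, as the numeric run ID. A made-up helper showing just that conversion (not part of the patch):

  // Hypothetical helper illustrating the TODO; "1a2b3c4d" is an invented flow ID.
  static long runIdFromScaldingFlowId(String hexFlowId) {
    return Long.parseLong(hexFlowId, 16);   // e.g. "1a2b3c4d" -> 439041101
  }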
+ + // Emit the puts + for (Put put : puts) { + context.write(JOB_TABLE, put); + context.progress(); + } + + // Write secondary index(es) + LOG.info("Writing secondary indexes"); + jobHistoryByIdService.writeIndexes(jobKey); + context.progress(); + appVersionService.addVersion(jobDesc.getCluster(), jobDesc.getUserName(), + jobDesc.getAppId(), jobDesc.getVersion(), submitTimeMillis); + context.progress(); + + InputStream jobHistoryInputStream = rawService + .getJobHistoryInputStreamFromResult(value); + + JobHistoryListener jobHistoryListener = new JobHistoryListener(jobKey); + + JobHistoryCopy.parseHistoryFromIS(jobHistoryInputStream, + jobHistoryListener); + + puts = jobHistoryListener.getJobPuts(); + LOG.info("Writing " + puts.size() + " Job puts to " + + Constants.HISTORY_TABLE); + + // Emit the puts + for (Put put : puts) { + context.write(JOB_TABLE, put); + // TODO: we should not have to do this, but need to confirm that + // TableRecordWriter does this for us. + context.progress(); + } + + puts = jobHistoryListener.getTaskPuts(); + LOG.info("Writing " + puts.size() + " Job puts to " + + Constants.HISTORY_TASK_TABLE); + + for (Put put : puts) { + context.write(TASK_TABLE, put); + // TODO: we should not have to do this, but need to confirm that + // TableRecordWriter does this for us. + context.progress(); + } + + } catch (RowKeyParseException rkpe) { + LOG.error("Failed to process record " + + (qualifiedJobId != null ? qualifiedJobId.toString() : ""), rkpe); + success = false; + } catch (MissingColumnInResultException mcire) { + LOG.error("Failed to process record " + + (qualifiedJobId != null ? qualifiedJobId.toString() : ""), mcire); + success = false; + } catch (ProcessingException pe) { + LOG.error("Failed to process record " + + (qualifiedJobId != null ? qualifiedJobId.toString() : ""), pe); + success = false; + } + + if (success) { + // Update counter to indicate failure. + context.getCounter(ProcessingCounter.RAW_ROW_SUCCESS_COUNT).increment(1); + } else { + // Update counter to indicate failure. + context.getCounter(ProcessingCounter.RAW_ROW_ERROR_COUNT).increment(1); + } + + // Indicate that we processed the RAW successfully so that we can skip it + // on the next scan (or not). + Put successPut = rawService.getJobProcessedSuccessPut(value.getRow(), + success); + // TODO: In the unlikely event of multiple mappers running against one RAW + // row, with one succeeding and one failing, there could be a race where the + // raw does not properly indicate the true status (which is questionable in + // any case with multiple simultaneous runs with different outcome). 
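One way the race described in that TODO could be narrowed, sketched speculatively here rather than taken from the patch, is HBase's checkAndPut, which only applies the Put while the checked cell still holds an expected value. The table handle, column family, and qualifier names below are placeholders:

import java.io.IOException;

import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class SuccessMarkerSketch {
  /** Speculative: only mark the row processed if nobody has changed the marker since we read it. */
  public static boolean markProcessedIfUnchanged(HTableInterface rawTable, byte[] rowKey,
      byte[] expectedCurrentValue, Put successPut) throws IOException {
    byte[] family = Bytes.toBytes("i");                          // placeholder column family
    byte[] qualifier = Bytes.toBytes("job_processed_success");   // placeholder qualifier
    return rawTable.checkAndPut(rowKey, family, qualifier, expectedCurrentValue, successPut);
  }
}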
+ context.write(RAW_TABLE, successPut); + + } + + @Override + protected void cleanup( + Mapper.Context context) + throws java.io.IOException, InterruptedException { + + IOException caught = null; + + if (jobHistoryByIdService != null) { + try { + jobHistoryByIdService.close(); + } catch (IOException ioe) { + caught = ioe; + } + } + if (appVersionService != null) { + try { + appVersionService.close(); + } catch (IOException ioe) { + // TODO: don't overwrite a previous exception + caught = ioe; + } + } + if (rawService != null) { + try { + rawService.close(); + } catch (IOException ioe) { + // TODO: don't overwrite a previous exception + caught = ioe; + } + } + + if (caught != null) { + throw caught; + } + } + +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobHistoryListener.java b/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobHistoryListener.java new file mode 100644 index 0000000..f9ca008 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/JobHistoryListener.java @@ -0,0 +1,259 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.mapreduce; + +import java.io.IOException; +import java.text.ParseException; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.JobHistoryCopy; +import org.apache.hadoop.mapred.JobHistoryCopy.Listener; +import org.apache.hadoop.mapred.JobHistoryCopy.RecordTypes; + +import com.twitter.hraven.Constants; +import com.twitter.hraven.JobHistoryKeys; +import com.twitter.hraven.JobKey; +import com.twitter.hraven.TaskKey; +import com.twitter.hraven.datasource.JobKeyConverter; +import com.twitter.hraven.datasource.TaskKeyConverter; +import com.twitter.hraven.etl.ImportException; + + +public class JobHistoryListener implements Listener { + + private static Log LOG = LogFactory.getLog(JobHistoryListener.class); + + private JobKey jobKey; + private String jobId; + /** Job ID, minus the leading "job_" */ + private String jobNumber = ""; + private final byte[] jobKeyBytes; + private List jobPuts = new LinkedList(); + private List taskPuts = new LinkedList(); + private JobKeyConverter jobKeyConv = new JobKeyConverter(); + private TaskKeyConverter taskKeyConv = new TaskKeyConverter(); + + /** + * Constructor for listener to be used to read in a Job History File. While + * reading a list of HBase puts is assembled. 
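As a usage sketch (a hypothetical helper, not code from the patch): the listener is handed to the JobHistoryCopy parser and simply accumulates Puts as records stream by.

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.mapred.JobHistoryCopy;

import com.twitter.hraven.JobKey;
import com.twitter.hraven.mapreduce.JobHistoryListener;

public class HistoryParseSketch {
  /** Hypothetical: parse one history stream and return the job-level Puts the listener built. */
  public static List<Put> collectJobPuts(JobKey jobKey, InputStream historyStream)
      throws IOException {
    JobHistoryListener listener = new JobHistoryListener(jobKey);
    JobHistoryCopy.parseHistoryFromIS(historyStream, listener);
    return listener.getJobPuts();   // task-level Puts are available via getTaskPuts()
  }
}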
+ * + * @param jobKey jobKey of the job to be persisted + */ + public JobHistoryListener(JobKey jobKey) { + if (null == jobKey) { + String msg = "JobKey cannot be null"; + LOG.error(msg); + throw new IllegalArgumentException(msg); + } + this.jobKey = jobKey; + this.jobKeyBytes = jobKeyConv.toBytes(jobKey); + setJobId(jobKey.getJobId().getJobIdString()); + } + + @Override + public void handle(RecordTypes recType, Map values) + throws IOException { + switch (recType) { + case Job: + handleJob(values); + break; + case Task: + handleTask(values); + break; + case MapAttempt: + handleMapAttempt(values); + break; + case ReduceAttempt: + handleReduceAttempt(values); + break; + default: + // skip other types + ; + } + //System.out.println("Reading: " + recType.toString()); + } + + private void handleJob(Map values) { + String id = values.get(JobHistoryKeys.JOBID); + + if (jobId == null) { + setJobId(id); + } else if (!jobId.equals(id)) { + String msg = "Current job ID '" + id + + "' does not match previously stored value '" + jobId + "'"; + LOG.error(msg); + throw new ImportException(msg); + } + // add job ID to values to put + Put p = new Put(this.jobKeyBytes); + for (Map.Entry e : values.entrySet()) { + addKeyValues(p, Constants.INFO_FAM_BYTES, e.getKey(), e.getValue()); + } + this.jobPuts.add(p); + } + + private void handleTask(Map values) { + byte[] taskIdKeyBytes = getTaskKey("task_", this.jobNumber, values.get(JobHistoryKeys.TASKID)); + Put p = new Put(taskIdKeyBytes); + + p.add(Constants.INFO_FAM_BYTES, Constants.RECORD_TYPE_COL_BYTES, + Bytes.toBytes(RecordTypes.Task.toString())); + for (Map.Entry e : values.entrySet()) { + addKeyValues(p, Constants.INFO_FAM_BYTES, e.getKey(), e.getValue()); + } + this.taskPuts.add(p); + } + + private void handleMapAttempt(Map values) { + byte[] taskIdKeyBytes = getTaskKey("attempt_", this.jobNumber, values.get(JobHistoryKeys.TASK_ATTEMPT_ID)); + Put p = new Put(taskIdKeyBytes); + + p.add(Constants.INFO_FAM_BYTES, Constants.RECORD_TYPE_COL_BYTES, + Bytes.toBytes(RecordTypes.MapAttempt.toString())); + for (Map.Entry e : values.entrySet()) { + addKeyValues(p, Constants.INFO_FAM_BYTES, e.getKey(), e.getValue()); + } + + this.taskPuts.add(p); + } + + private void handleReduceAttempt(Map values) { + byte[] taskIdKeyBytes = getTaskKey("attempt_", this.jobNumber, values.get(JobHistoryKeys.TASK_ATTEMPT_ID)); + Put p = new Put(taskIdKeyBytes); + + p.add(Constants.INFO_FAM_BYTES, Constants.RECORD_TYPE_COL_BYTES, + Bytes.toBytes(RecordTypes.ReduceAttempt.toString())); + for (Map.Entry e : values.entrySet()) { + addKeyValues(p, Constants.INFO_FAM_BYTES, e.getKey(), e.getValue()); + } + + this.taskPuts.add(p); + } + + private void addKeyValues(Put p, byte[] family, JobHistoryKeys key, String value) { + if (key == JobHistoryKeys.COUNTERS || key == JobHistoryKeys.MAP_COUNTERS + || key == JobHistoryKeys.REDUCE_COUNTERS) { + try { + Counters counters = Counters.fromEscapedCompactString(value); + /* + * Name counter columns as: + * g!groupname!countername + */ + byte[] counterPrefix = null; + if (key == JobHistoryKeys.COUNTERS) { + counterPrefix = Bytes.add(Constants.COUNTER_COLUMN_PREFIX_BYTES, + Constants.SEP_BYTES); + } else if (key == JobHistoryKeys.MAP_COUNTERS) { + counterPrefix = Bytes.add(Constants.MAP_COUNTER_COLUMN_PREFIX_BYTES, + Constants.SEP_BYTES); + } else if (key == JobHistoryKeys.REDUCE_COUNTERS) { + counterPrefix = Bytes.add(Constants.REDUCE_COUNTER_COLUMN_PREFIX_BYTES, + Constants.SEP_BYTES); + } else { + throw new IllegalArgumentException("Unknown counter 
type "+key.toString()); + } + + for (Counters.Group group : counters) { + byte[] groupPrefix = Bytes.add( + counterPrefix, Bytes.toBytes(group.getName()), Constants.SEP_BYTES); + for (Counters.Counter counter : group) { + byte[] qualifier = Bytes.add(groupPrefix, Bytes.toBytes(counter.getName())); + p.add(family, qualifier, Bytes.toBytes(counter.getValue())); + } + } + } catch (ParseException pe) { + LOG.error("Counters could not be parsed from string'"+value+"'", pe); + } + } else { + @SuppressWarnings("rawtypes") + Class clazz = JobHistoryKeys.KEY_TYPES.get(key); + byte[] valueBytes = null; + if (Integer.class.equals(clazz)) { + try { + valueBytes = (value != null && value.trim().length() > 0) ? + Bytes.toBytes(Integer.parseInt(value)) : Constants.ZERO_INT_BYTES; + } catch (NumberFormatException nfe) { + // us a default value + valueBytes = Constants.ZERO_INT_BYTES; + } + } else if (Long.class.equals(clazz)) { + try { + valueBytes = (value != null && value.trim().length() > 0) ? + Bytes.toBytes(Long.parseLong(value)) : Constants.ZERO_LONG_BYTES; + } catch (NumberFormatException nfe) { + // us a default value + valueBytes = Constants.ZERO_LONG_BYTES; + } + } else { + // keep the string representation by default + valueBytes = Bytes.toBytes(value); + } + byte[] qualifier = Bytes.toBytes(key.toString().toLowerCase()); + p.add(family, qualifier, valueBytes); + } + } + + /** + * Sets the job ID and strips out the job number (job ID minus the "job_" prefix). + * @param id + */ + private void setJobId(String id) { + this.jobId = id; + if (id != null && id.startsWith("job_") && id.length() > 4) { + this.jobNumber = id.substring(4); + } + } + + /** + * Returns the Task ID or Task Attempt ID, stripped of the leading job ID, + * appended to the job row key. + */ + public byte[] getTaskKey(String prefix, String jobNumber, String fullId) { + String taskComponent = fullId; + if (fullId == null) { + taskComponent = ""; + } else { + String expectedPrefix = prefix + jobNumber + "_"; + if (fullId.startsWith(expectedPrefix) + && fullId.length() > expectedPrefix.length()) { + taskComponent = fullId.substring(expectedPrefix.length()); + } + } + + return taskKeyConv.toBytes(new TaskKey(this.jobKey, taskComponent)); + } + + /** + * Return the generated list of put assembled when + * {@link JobHistoryCopy#parseHistoryFromFS(String, Listener, org.apache.hadoop.fs.FileSystem)} + * is called with this listener. + * @return a non-null (possibly empty) list of jobPuts + */ + public List getJobPuts() { + return this.jobPuts; + } + + public List getTaskPuts() { + return this.taskPuts; + } +} diff --git a/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/ProcessingCounter.java b/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/ProcessingCounter.java new file mode 100644 index 0000000..bebb753 --- /dev/null +++ b/hraven-etl/src/main/java/com/twitter/hraven/mapreduce/ProcessingCounter.java @@ -0,0 +1,30 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
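The counter columns built above follow the g!groupname!countername layout from the comment, glued together with Bytes.add. A standalone sketch of that qualifier composition, using literal "g" and "!" values as stand-ins for the prefix and Constants.SEP_BYTES:

import org.apache.hadoop.hbase.util.Bytes;

public class CounterQualifierSketch {
  private static final byte[] SEP = Bytes.toBytes("!");                    // stand-in for Constants.SEP_BYTES
  private static final byte[] GROUP_COUNTER_PREFIX = Bytes.toBytes("g");   // stand-in prefix

  /** Builds a qualifier of the form g!groupName!counterName. */
  public static byte[] counterQualifier(String groupName, String counterName) {
    byte[] prefix = Bytes.add(GROUP_COUNTER_PREFIX, SEP);
    byte[] groupPrefix = Bytes.add(prefix, Bytes.toBytes(groupName), SEP);
    return Bytes.add(groupPrefix, Bytes.toBytes(counterName));
  }

  public static void main(String[] args) {
    // Prints g!FileSystemCounters!HDFS_BYTES_READ
    System.out.println(Bytes.toString(counterQualifier("FileSystemCounters", "HDFS_BYTES_READ")));
  }
}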
+*/ +package com.twitter.hraven.mapreduce; + +public enum ProcessingCounter { + + /** + * Indicating how many raw records (jobs) could not be processed successfully. + */ + RAW_ROW_ERROR_COUNT, + + /** + * Indicating how many raw records (jobs) could not be processed successfully. + */ + RAW_ROW_SUCCESS_COUNT; + +} diff --git a/hraven-etl/src/main/java/org/apache/hadoop/mapred/JobHistoryCopy.java b/hraven-etl/src/main/java/org/apache/hadoop/mapred/JobHistoryCopy.java new file mode 100644 index 0000000..76da0fb --- /dev/null +++ b/hraven-etl/src/main/java/org/apache/hadoop/mapred/JobHistoryCopy.java @@ -0,0 +1,2163 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.mapred; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.mapreduce.JobACL; +import org.apache.hadoop.security.authorize.AccessControlList; +import org.apache.hadoop.util.StringUtils; + +import com.twitter.hraven.JobHistoryKeys; + +/** + * TODO: ** HACK-ALERT ! ** + * Note that this is a copy of the original JobHistory class. + * Reason is that the class needs to be modified, yet needs access to + * package-private methods on the JobTracker class. + * Hence the same package name. + * Once this works, we'll submit upstream fixes to this class and + * port that back to our distro. + * + * Changes that we have made: + * 1) Constructor added with input stream + * 2) Keys enum refactored in standalone {@link JobHistoryKeys} class + * as it is needed outside of this context. 
+ * TODO: ** End of HACK-ALERT ** + * + * Provides methods for writing to and reading from job history. + * Job History works in an append mode, JobHistory and its inner classes provide methods + * to log job events. + * + * JobHistory is split into multiple files, format of each file is plain text where each line + * is of the format [type (key=value)*], where type identifies the type of the record. + * Type maps to UID of one of the inner classes of this class. + * + * Job history is maintained in a master index which contains star/stop times of all jobs with + * a few other job level properties. Apart from this each job's history is maintained in a seperate history + * file. name of job history files follows the format jobtrackerId_jobid + * + * For parsing the job history it supports a listener based interface where each line is parsed + * and passed to listener. The listener can create an object model of history or look for specific + * events and discard rest of the history. + * + * CHANGE LOG : + * Version 0 : The history has the following format : + * TAG KEY1="VALUE1" KEY2="VALUE2" and so on. + TAG can be Job, Task, MapAttempt or ReduceAttempt. + Note that a '"' is the line delimiter. + * Version 1 : Changes the line delimiter to '.' + Values are now escaped for unambiguous parsing. + Added the Meta tag to store version info. + */ +public class JobHistoryCopy { + + static final long VERSION = 1L; + public static final Log LOG = LogFactory.getLog(JobHistoryCopy.class); + private static final char DELIMITER = ' '; + static final char LINE_DELIMITER_CHAR = '.'; + static final char[] charsToEscape = new char[] {'"', '=', + LINE_DELIMITER_CHAR}; + static final String DIGITS = "[0-9]+"; + + static final String KEY = "(\\w+)"; + // value is any character other than quote, but escaped quotes can be there + static final String VALUE = "[^\"\\\\]*+(?:\\\\.[^\"\\\\]*+)*+"; + + static final Pattern pattern = Pattern.compile(KEY + "=" + "\"" + VALUE + "\""); + + public static final int JOB_NAME_TRIM_LENGTH = 50; + private static String JOBTRACKER_UNIQUE_STRING = null; + private static String LOG_DIR = null; + private static final String SECONDARY_FILE_SUFFIX = ".recover"; + private static long jobHistoryBlockSize = 0; + private static String jobtrackerHostname; + private static JobHistoryFilesManager fileManager = null; + final static FsPermission HISTORY_DIR_PERMISSION = + FsPermission.createImmutable((short) 0755); // rwxr-xr-x + final static FsPermission HISTORY_FILE_PERMISSION = + FsPermission.createImmutable((short) 0744); // rwxr--r-- + private static FileSystem LOGDIR_FS; // log dir filesystem + private static FileSystem DONEDIR_FS; // Done dir filesystem + private static JobConf jtConf; + private static Path DONE = null; // folder for completed jobs + private static boolean aclsEnabled = false; + /** + * A filter for conf files + */ + private static final PathFilter CONF_FILTER = new PathFilter() { + public boolean accept(Path path) { + return path.getName().endsWith("_conf.xml"); + } + }; + + private static Map jobHistoryFileMap = + Collections.synchronizedMap( + new LinkedHashMap()); + + private static class MovedFileInfo { + private final String historyFile; + private final long timestamp; + public MovedFileInfo(String historyFile, long timestamp) { + this.historyFile = historyFile; + this.timestamp = timestamp; + } + } + + /** + * Given the job id, return the history file path from the cache + */ + public static String getHistoryFilePath(JobID jobId) { + MovedFileInfo info = 
jobHistoryFileMap.get(jobId); + if (info == null) { + return null; + } + return info.historyFile; + } + + /** + * A class that manages all the files related to a job. For now + * - writers : list of open files + * - job history filename + * - job conf filename + */ + private static class JobHistoryFilesManager { + // a private (virtual) folder for all the files related to a running job + private static class FilesHolder { + ArrayList writers = new ArrayList(); + Path historyFilename; // path of job history file + Path confFilename; // path of job's conf + } + + private ThreadPoolExecutor executor = null; + private final Configuration conf; + private final JobTracker jobTracker; + + // cache from job-key to files associated with it. + private Map fileCache = + new ConcurrentHashMap(); + + JobHistoryFilesManager(Configuration conf, JobTracker jobTracker) + throws IOException { + this.conf = conf; + this.jobTracker = jobTracker; + } + + + void start() { + executor = new ThreadPoolExecutor(1, 3, 1, + TimeUnit.HOURS, new LinkedBlockingQueue()); + } + + private FilesHolder getFileHolder(JobID id) { + FilesHolder holder = fileCache.get(id); + if (holder == null) { + holder = new FilesHolder(); + fileCache.put(id, holder); + } + return holder; + } + + void addWriter(JobID id, PrintWriter writer) { + FilesHolder holder = getFileHolder(id); + holder.writers.add(writer); + } + + void setHistoryFile(JobID id, Path file) { + FilesHolder holder = getFileHolder(id); + holder.historyFilename = file; + } + + void setConfFile(JobID id, Path file) { + FilesHolder holder = getFileHolder(id); + holder.confFilename = file; + } + + ArrayList getWriters(JobID id) { + FilesHolder holder = fileCache.get(id); + return holder == null ? null : holder.writers; + } + + Path getHistoryFile(JobID id) { + FilesHolder holder = fileCache.get(id); + return holder == null ? null : holder.historyFilename; + } + + Path getConfFileWriters(JobID id) { + FilesHolder holder = fileCache.get(id); + return holder == null ? 
null : holder.confFilename; + } + + void purgeJob(JobID id) { + fileCache.remove(id); + } + + void moveToDone(final JobID id) { + final List paths = new ArrayList(); + final Path historyFile = fileManager.getHistoryFile(id); + if (historyFile == null) { + LOG.info("No file for job-history with " + id + " found in cache!"); + } else { + paths.add(historyFile); + } + + final Path confPath = fileManager.getConfFileWriters(id); + if (confPath == null) { + LOG.info("No file for jobconf with " + id + " found in cache!"); + } else { + paths.add(confPath); + } + + executor.execute(new Runnable() { + + public void run() { + //move the files to DONE folder + try { + for (Path path : paths) { + //check if path exists, in case of retries it may not exist + if (LOGDIR_FS.exists(path)) { + LOG.info("Moving " + path.toString() + " to " + + DONE.toString()); + DONEDIR_FS.moveFromLocalFile(path, DONE); + DONEDIR_FS.setPermission(new Path(DONE, path.getName()), + new FsPermission(HISTORY_FILE_PERMISSION)); + } + } + } catch (Throwable e) { + LOG.error("Unable to move history file to DONE folder.", e); + } + String historyFileDonePath = null; + if (historyFile != null) { + historyFileDonePath = new Path(DONE, + historyFile.getName()).toString(); + } + + jobHistoryFileMap.put(id, new MovedFileInfo(historyFileDonePath, + System.currentTimeMillis())); + jobTracker.historyFileCopied(id, historyFileDonePath); + + //purge the job from the cache + fileManager.purgeJob(id); + } + + }); + } + + void removeWriter(JobID jobId, PrintWriter writer) { + fileManager.getWriters(jobId).remove(writer); + } + } + /** + * Record types are identifiers for each line of log in history files. + * A record type appears as the first token in a single line of log. + */ + public static enum RecordTypes { + Jobtracker, Job, Task, MapAttempt, ReduceAttempt, Meta + } + + /** + * This enum contains some of the values commonly used by history log events. + * since values in history can only be strings - Values.name() is used in + * most places in history file. + */ + public static enum Values { + SUCCESS, FAILED, KILLED, MAP, REDUCE, CLEANUP, RUNNING, PREP, SETUP + } + + /** + * Initialize JobHistory files. + * @param conf Jobconf of the job tracker. 
+ * @param hostname jobtracker's hostname + * @param jobTrackerStartTime jobtracker's start time + * @return true if intialized properly + * false otherwise + */ + public static void init(JobTracker jobTracker, JobConf conf, + String hostname, long jobTrackerStartTime) throws IOException { + LOG_DIR = conf.get("hadoop.job.history.location" , + "file:///" + new File( + System.getProperty("hadoop.log.dir")).getAbsolutePath() + + File.separator + "history"); + JOBTRACKER_UNIQUE_STRING = hostname + "_" + + String.valueOf(jobTrackerStartTime) + "_"; + jobtrackerHostname = hostname; + Path logDir = new Path(LOG_DIR); + LOGDIR_FS = logDir.getFileSystem(conf); + if (!LOGDIR_FS.exists(logDir)){ + if (!LOGDIR_FS.mkdirs(logDir, new FsPermission(HISTORY_DIR_PERMISSION))) { + throw new IOException("Mkdirs failed to create " + logDir.toString()); + } + } + conf.set("hadoop.job.history.location", LOG_DIR); + // set the job history block size (default is 3MB) + jobHistoryBlockSize = + conf.getLong("mapred.jobtracker.job.history.block.size", + 3 * 1024 * 1024); + jtConf = conf; + + // queue and job level security is enabled on the mapreduce cluster or not + aclsEnabled = conf.getBoolean(JobConf.MR_ACLS_ENABLED, false); + + // initialize the file manager + fileManager = new JobHistoryFilesManager(conf, jobTracker); + } + + static void initDone(JobConf conf, FileSystem fs) throws IOException { + //if completed job history location is set, use that + String doneLocation = conf. + get("mapred.job.tracker.history.completed.location"); + if (doneLocation != null) { + Path donePath = new Path(doneLocation); + DONEDIR_FS = donePath.getFileSystem(conf); + DONE = DONEDIR_FS.makeQualified(donePath); + } else { + DONE = new Path(LOG_DIR, "done"); + DONEDIR_FS = LOGDIR_FS; + } + + //If not already present create the done folder with appropriate + //permission + if (!DONEDIR_FS.exists(DONE)) { + LOG.info("Creating DONE folder at "+ DONE); + if (! DONEDIR_FS.mkdirs(DONE, + new FsPermission(HISTORY_DIR_PERMISSION))) { + throw new IOException("Mkdirs failed to create " + DONE.toString()); + } + } + fileManager.start(); + } + + + /** + * Manages job-history's meta information such as version etc. + * Helps in logging version information to the job-history and recover + * version information from the history. + */ + static class MetaInfoManager implements Listener { + private long version = 0L; + private KeyValuePair pairs = new KeyValuePair(); + + // Extract the version of the history that was used to write the history + public MetaInfoManager(String line) throws IOException { + if (null != line) { + // Parse the line + parseLine(line, this, false); + } + } + + // Get the line delimiter + char getLineDelim() { + if (version == 0) { + return '"'; + } else { + return LINE_DELIMITER_CHAR; + } + } + + // Checks if the values are escaped or not + boolean isValueEscaped() { + // Note that the values are not escaped in version 0 + return version != 0; + } + + public void handle(RecordTypes recType, Map values) + throws IOException { + // Check if the record is of type META + if (RecordTypes.Meta == recType) { + pairs.handle(values); + version = pairs.getLong(JobHistoryKeys.VERSION); // defaults to 0 + } + } + + /** + * Logs history meta-info to the history file. This needs to be called once + * per history file. + * @param jobId job id, assigned by jobtracker. 
+ */ + static void logMetaInfo(ArrayList writers){ + if (null != writers){ + JobHistoryCopy.log(writers, RecordTypes.Meta, + new JobHistoryKeys[] {JobHistoryKeys.VERSION}, + new String[] {String.valueOf(VERSION)}); + } + } + } + + /** Escapes the string especially for {@link JobHistoryCopy} + */ + static String escapeString(String data) { + return StringUtils.escapeString(data, StringUtils.ESCAPE_CHAR, + charsToEscape); + } + + + /** + * Parses history file and invokes Listener.handle() for + * each line of history. It can be used for looking through history + * files for specific items without having to keep whole history in memory. + * @param path path to history file + * @param l Listener for history events + * @param fs FileSystem where history file is present + * @throws IOException + */ + public static void parseHistoryFromFS(String path, Listener l, FileSystem fs) + throws IOException{ + FSDataInputStream in = fs.open(new Path(path)); + parseHistoryFromIS(in, l); + } + + + + /** + * Parses history (file) input stream and invokes Listener.handle() for + * each line of history. It can be used for looking through history + * (file) input streams for specific items without having to keep whole history in memory. + * @param in containing history file content + * @param l Listener for history events + * @throws IOException + */ + public static void parseHistoryFromIS(InputStream in, Listener l) + throws IOException{ + BufferedReader reader = new BufferedReader(new InputStreamReader (in)); + try { + String line = null; + StringBuffer buf = new StringBuffer(); + + // Read the meta-info line. Note that this might a jobinfo line for files + // written with older format + line = reader.readLine(); + + // Check if the file is empty + if (line == null) { + return; + } + + // Get the information required for further processing + MetaInfoManager mgr = new MetaInfoManager(line); + boolean isEscaped = mgr.isValueEscaped(); + String lineDelim = String.valueOf(mgr.getLineDelim()); + String escapedLineDelim = + StringUtils.escapeString(lineDelim, StringUtils.ESCAPE_CHAR, + mgr.getLineDelim()); + + do { + buf.append(line); + if (!line.trim().endsWith(lineDelim) + || line.trim().endsWith(escapedLineDelim)) { + buf.append("\n"); + continue; + } + parseLine(buf.toString(), l, isEscaped); + buf = new StringBuffer(); + } while ((line = reader.readLine())!= null); + } finally { + try { reader.close(); } catch (IOException ex) {} + } + } + + /** + * Parse a single line of history. + * @param line + * @param l + * @throws IOException + */ + private static void parseLine(String line, Listener l, boolean isEscaped) + throws IOException{ + // extract the record type + int idx = line.indexOf(' '); + String recType = line.substring(0, idx); + String data = line.substring(idx+1, line.length()); + + Matcher matcher = pattern.matcher(data); + Map parseBuffer = new HashMap(); + + while(matcher.find()){ + String tuple = matcher.group(0); + String []parts = StringUtils.split(tuple, StringUtils.ESCAPE_CHAR, '='); + String value = parts[1].substring(1, parts[1].length() -1); + if (isEscaped) { + value = StringUtils.unEscapeString(value, StringUtils.ESCAPE_CHAR, + charsToEscape); + } + parseBuffer.put(JobHistoryKeys.valueOf(parts[0]), value); + } + + l.handle(RecordTypes.valueOf(recType), parseBuffer); + + parseBuffer.clear(); + } + + + /** + * Log a raw record type with keys and values. This is method is generally not used directly. 
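parseLine above peels off the record type and then walks KEY="VALUE" pairs with the class's regular expression. A standalone illustration on a made-up Task line, reusing the same KEY and VALUE expressions but with simplified pair splitting and no unescaping:

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HistoryLineSketch {
  // Same KEY / VALUE expressions as JobHistoryCopy.
  private static final String KEY = "(\\w+)";
  private static final String VALUE = "[^\"\\\\]*+(?:\\\\.[^\"\\\\]*+)*+";
  private static final Pattern PAIR = Pattern.compile(KEY + "=" + "\"" + VALUE + "\"");

  public static void main(String[] args) {
    String line = "Task TASKID=\"task_201306251938_0001_m_000000\" TASK_TYPE=\"MAP\" START_TIME=\"1372200001000\" .";
    int idx = line.indexOf(' ');
    String recType = line.substring(0, idx);   // "Task"
    String data = line.substring(idx + 1);

    Map<String, String> values = new HashMap<String, String>();
    Matcher matcher = PAIR.matcher(data);
    while (matcher.find()) {
      String[] parts = matcher.group(0).split("=", 2);
      values.put(parts[0], parts[1].substring(1, parts[1].length() - 1));   // strip surrounding quotes
    }
    System.out.println(recType + " " + values);   // Task {TASKID=..., TASK_TYPE=MAP, ...} (map order may vary)
  }
}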
+ * @param recordType type of log event + * @param key key + * @param value value + */ + + static void log(PrintWriter out, RecordTypes recordType, JobHistoryKeys key, + String value){ + value = escapeString(value); + out.println(recordType.name() + DELIMITER + key + "=\"" + value + "\"" + + DELIMITER + LINE_DELIMITER_CHAR); + } + + /** + * Log a number of keys and values with record. the array length of keys and values + * should be same. + * @param recordType type of log event + * @param keys type of log event + * @param values type of log event + */ + + /** + * Log a number of keys and values with record. the array length of keys and values + * should be same. + * @param recordType type of log event + * @param keys type of log event + * @param values type of log event + */ + + static void log(ArrayList writers, RecordTypes recordType, + JobHistoryKeys[] keys, String[] values) { + log(writers, recordType, keys, values, null); + } + + /** + * Log a number of keys and values with record. the array length of keys and values + * should be same. + * @param recordType type of log event + * @param keys type of log event + * @param values type of log event + * @param JobID jobid of the job + */ + + static void log(ArrayList writers, RecordTypes recordType, + JobHistoryKeys[] keys, String[] values, JobID id) { + + // First up calculate the length of buffer, so that we are performant + // enough. + int length = recordType.name().length() + keys.length * 4 + 2; + for (int i = 0; i < keys.length; i++) { + values[i] = escapeString(values[i]); + length += values[i].length() + keys[i].toString().length(); + } + + // We have the length of the buffer, now construct it. + StringBuilder builder = new StringBuilder(length); + builder.append(recordType.name()); + builder.append(DELIMITER); + for(int i =0; i< keys.length; i++){ + builder.append(keys[i]); + builder.append("=\""); + builder.append(values[i]); + builder.append("\""); + builder.append(DELIMITER); + } + builder.append(LINE_DELIMITER_CHAR); + + for (Iterator iter = writers.iterator(); iter.hasNext();) { + PrintWriter out = iter.next(); + out.println(builder.toString()); + if (out.checkError() && id != null) { + LOG.info("Logging failed for job " + id + "removing PrintWriter from FileManager"); + iter.remove(); + } + } + } + + /** + * Get the history location + */ + static Path getJobHistoryLocation() { + return new Path(LOG_DIR); + } + + /** + * Get the history location for completed jobs + */ + static Path getCompletedJobHistoryLocation() { + return DONE; + } + + /** + * Base class contais utility stuff to manage types key value pairs with enums. + */ + static class KeyValuePair{ + private Map values = new HashMap(); + + /** + * Get 'String' value for given key. Most of the places use Strings as + * values so the default get' method returns 'String'. This method never returns + * null to ease on GUIs. if no value is found it returns empty string "" + * @param k + * @return if null it returns empty string - "" + */ + public String get(JobHistoryKeys k){ + String s = values.get(k); + return s == null ? "" : s; + } + /** + * Convert value from history to int and return. + * if no value is found it returns 0. + * @param k key + */ + public int getInt(JobHistoryKeys k){ + String s = values.get(k); + if (null != s){ + return Integer.parseInt(s); + } + return 0; + } + /** + * Convert value from history to int and return. + * if no value is found it returns 0. 
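Values are escaped before logging so that quotes, '=' and the '.' line delimiter cannot corrupt the record format, and parseLine unescapes them on the way back in. A small round trip through the same Hadoop StringUtils calls the class uses (the sample value is invented):

import org.apache.hadoop.util.StringUtils;

public class EscapeRoundTrip {
  // Same characters JobHistoryCopy escapes: quote, '=', and the '.' line delimiter.
  private static final char[] CHARS_TO_ESCAPE = new char[] {'"', '=', '.'};

  public static void main(String[] args) {
    String value = "select * from t where x=\"1\".";   // invented value containing all three characters
    String escaped = StringUtils.escapeString(value, StringUtils.ESCAPE_CHAR, CHARS_TO_ESCAPE);
    String back = StringUtils.unEscapeString(escaped, StringUtils.ESCAPE_CHAR, CHARS_TO_ESCAPE);
    System.out.println(escaped);
    System.out.println(value.equals(back));   // true
  }
}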
+ * @param k + */ + public long getLong(JobHistoryKeys k){ + String s = values.get(k); + if (null != s){ + return Long.parseLong(s); + } + return 0; + } + /** + * Set value for the key. + * @param k + * @param s + */ + public void set(JobHistoryKeys k, String s){ + values.put(k, s); + } + /** + * Adds all values in the Map argument to its own values. + * @param m + */ + public void set(Map m){ + values.putAll(m); + } + /** + * Reads values back from the history, input is same Map as passed to Listener by parseHistory(). + * @param values + */ + public synchronized void handle(Map values){ + set(values); + } + /** + * Returns Map containing all key-values. + */ + public Map getValues(){ + return values; + } + } + + /** + * Helper class for logging or reading back events related to job start, finish or failure. + */ + public static class JobInfo extends KeyValuePair{ + + private Map allTasks = new TreeMap(); + private Map jobACLs = + new HashMap(); + private String queueName = null;// queue to which this job was submitted to + + /** Create new JobInfo */ + public JobInfo(String jobId){ + set(JobHistoryKeys.JOBID, jobId); + } + + /** + * Returns all map and reduce tasks . + */ + public Map getAllTasks() { return allTasks; } + + /** + * Get the job acls. + * + * @return a {@link Map} from {@link JobACL} to {@link AccessControlList} + */ + public Map getJobACLs() { + return jobACLs; + } + + @Override + public synchronized void handle(Map values) { + if (values.containsKey(JobHistoryKeys.SUBMIT_TIME)) {// job submission + // construct the job ACLs + String viewJobACL = values.get(JobHistoryKeys.VIEW_JOB); + String modifyJobACL = values.get(JobHistoryKeys.MODIFY_JOB); + if (viewJobACL != null) { + jobACLs.put(JobACL.VIEW_JOB, new AccessControlList(viewJobACL)); + } + if (modifyJobACL != null) { + jobACLs.put(JobACL.MODIFY_JOB, new AccessControlList(modifyJobACL)); + } + // get the job queue name + queueName = values.get(JobHistoryKeys.JOB_QUEUE); + } + super.handle(values); + } + + String getJobQueue() { + return queueName; + } + + /** + * Get the path of the locally stored job file + * @param jobId id of the job + * @return the path of the job file on the local file system + */ + public static String getLocalJobFilePath(JobID jobId){ + return System.getProperty("hadoop.log.dir") + File.separator + + jobId + "_conf.xml"; + } + + /** + * Helper function to encode the URL of the path of the job-history + * log file. + * + * @param logFile path of the job-history file + * @return URL encoded path + * @throws IOException + */ + public static String encodeJobHistoryFilePath(String logFile) + throws IOException { + Path rawPath = new Path(logFile); + String encodedFileName = null; + try { + encodedFileName = URLEncoder.encode(rawPath.getName(), "UTF-8"); + } catch (UnsupportedEncodingException uee) { + IOException ioe = new IOException(); + ioe.initCause(uee); + ioe.setStackTrace(uee.getStackTrace()); + throw ioe; + } + + Path encodedPath = new Path(rawPath.getParent(), encodedFileName); + return encodedPath.toString(); + } + + /** + * Helper function to encode the URL of the filename of the job-history + * log file. 
+ * + * @param logFileName file name of the job-history file + * @return URL encoded filename + * @throws IOException + */ + public static String encodeJobHistoryFileName(String logFileName) + throws IOException { + String encodedFileName = null; + try { + encodedFileName = URLEncoder.encode(logFileName, "UTF-8"); + } catch (UnsupportedEncodingException uee) { + IOException ioe = new IOException(); + ioe.initCause(uee); + ioe.setStackTrace(uee.getStackTrace()); + throw ioe; + } + return encodedFileName; + } + + /** + * Helper function to decode the URL of the filename of the job-history + * log file. + * + * @param logFileName file name of the job-history file + * @return URL decoded filename + * @throws IOException + */ + public static String decodeJobHistoryFileName(String logFileName) + throws IOException { + String decodedFileName = null; + try { + decodedFileName = URLDecoder.decode(logFileName, "UTF-8"); + } catch (UnsupportedEncodingException uee) { + IOException ioe = new IOException(); + ioe.initCause(uee); + ioe.setStackTrace(uee.getStackTrace()); + throw ioe; + } + return decodedFileName; + } + + /** + * Get the job name from the job conf + */ + static String getJobName(JobConf jobConf) { + String jobName = jobConf.getJobName(); + if (jobName == null || jobName.length() == 0) { + jobName = "NA"; + } + return jobName; + } + + /** + * Get the user name from the job conf + */ + public static String getUserName(JobConf jobConf) { + String user = jobConf.getUser(); + if (user == null || user.length() == 0) { + user = "NA"; + } + return user; + } + + /** + * Get the job history file path given the history filename + */ + public static Path getJobHistoryLogLocation(String logFileName) + { + return LOG_DIR == null ? null : new Path(LOG_DIR, logFileName); + } + + /** + * Get the user job history file path + */ + public static Path getJobHistoryLogLocationForUser(String logFileName, + JobConf jobConf) { + // find user log directory + Path userLogFile = null; + Path outputPath = FileOutputFormat.getOutputPath(jobConf); + String userLogDir = jobConf.get("hadoop.job.history.user.location", + outputPath == null + ? null + : outputPath.toString()); + if ("none".equals(userLogDir)) { + userLogDir = null; + } + if (userLogDir != null) { + userLogDir = userLogDir + Path.SEPARATOR + "_logs" + Path.SEPARATOR + + "history"; + userLogFile = new Path(userLogDir, logFileName); + } + return userLogFile; + } + + /** + * Generates the job history filename for a new job + */ + private static String getNewJobHistoryFileName(JobConf jobConf, JobID id) { + return JOBTRACKER_UNIQUE_STRING + + id.toString() + "_" + + getUserName(jobConf) + + "_" + + trimJobName(getJobName(jobConf)); + } + + /** + * Trims the job-name if required + */ + private static String trimJobName(String jobName) { + if (jobName.length() > JOB_NAME_TRIM_LENGTH) { + jobName = jobName.substring(0, JOB_NAME_TRIM_LENGTH); + } + return jobName; + } + + private static String escapeRegexChars( String string ) { + return "\\Q"+string.replaceAll("\\\\E", "\\\\E\\\\\\\\E\\\\Q")+"\\E"; + } + + /** + * Recover the job history filename from the history folder. 
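Putting getNewJobHistoryFileName and trimJobName together, a history file is named jobtracker-unique-string + job-id + "_" + user + "_" + job-name, with the job name capped at 50 characters. A standalone illustration with invented values:

public class HistoryFileNameSketch {
  private static final int JOB_NAME_TRIM_LENGTH = 50;   // same cap JobHistoryCopy uses

  static String trimJobName(String jobName) {
    return jobName.length() > JOB_NAME_TRIM_LENGTH
        ? jobName.substring(0, JOB_NAME_TRIM_LENGTH) : jobName;
  }

  public static void main(String[] args) {
    // Invented jobtracker identity and job details, mirroring JOBTRACKER_UNIQUE_STRING + id + user + name.
    String jobtrackerUniqueString = "jt.example.com_1372200000000_";
    String fileName = jobtrackerUniqueString + "job_201306251938_0001" + "_" + "userName1" + "_"
        + trimJobName("A very long Cascading flow name that would otherwise blow up the filename");
    System.out.println(fileName);   // the job name portion is cut at 50 characters
  }
}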
+ * Uses the following pattern + * $jt-hostname_[0-9]*_$job-id_$user-$job-name* + * @param jobConf the job conf + * @param id job id + */ + public static synchronized String getJobHistoryFileName(JobConf jobConf, + JobID id) + throws IOException { + return getJobHistoryFileName(jobConf, id, new Path(LOG_DIR), LOGDIR_FS); + } + + static synchronized String getDoneJobHistoryFileName(JobConf jobConf, + JobID id) throws IOException { + if (DONE == null) { + return null; + } + return getJobHistoryFileName(jobConf, id, DONE, DONEDIR_FS); + } + + /** + * @param dir The directory where to search. + */ + private static synchronized String getJobHistoryFileName(JobConf jobConf, + JobID id, Path dir, FileSystem fs) + throws IOException { + String user = getUserName(jobConf); + String jobName = trimJobName(getJobName(jobConf)); + if (LOG_DIR == null) { + return null; + } + + // Make the pattern matching the job's history file + final Pattern historyFilePattern = + Pattern.compile(jobtrackerHostname + "_" + DIGITS + "_" + + id.toString() + "_" + user + "_" + + escapeRegexChars(jobName) + "+"); + // a path filter that matches 4 parts of the filenames namely + // - jt-hostname + // - job-id + // - username + // - jobname + PathFilter filter = new PathFilter() { + public boolean accept(Path path) { + String fileName = path.getName(); + try { + fileName = decodeJobHistoryFileName(fileName); + } catch (IOException ioe) { + LOG.info("Error while decoding history file " + fileName + "." + + " Ignoring file.", ioe); + return false; + } + return historyFilePattern.matcher(fileName).find(); + } + }; + + FileStatus[] statuses = fs.listStatus(dir, filter); + String filename = null; + if (statuses.length == 0) { + LOG.info("Nothing to recover for job " + id); + } else { + // return filename considering that fact the name can be a + // secondary filename like filename.recover + filename = getPrimaryFilename(statuses[0].getPath().getName(), jobName); + LOG.info("Recovered job history filename for job " + id + " is " + + filename); + } + return filename; + } + + // removes all extra extensions from a filename and returns the core/primary + // filename + private static String getPrimaryFilename(String filename, String jobName) + throws IOException{ + filename = decodeJobHistoryFileName(filename); + // Remove the '.recover' suffix if it exists + if (filename.endsWith(jobName + SECONDARY_FILE_SUFFIX)) { + int newLength = filename.length() - SECONDARY_FILE_SUFFIX.length(); + filename = filename.substring(0, newLength); + } + return encodeJobHistoryFileName(filename); + } + + /** Since there was a restart, there should be a master file and + * a recovery file. Once the recovery is complete, the master should be + * deleted as an indication that the recovery file should be treated as the + * master upon completion or next restart. 
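The recovery lookup above rebuilds the expected filename as a regex of the form hostname_digits_jobid_user_quoted-jobname. A standalone check against an invented filename, with Pattern.quote standing in for escapeRegexChars:

import java.util.regex.Pattern;

public class HistoryFilePatternSketch {
  public static void main(String[] args) {
    String hostname = "jt.example.com";            // invented jobtracker hostname
    String jobId = "job_201306251938_0001";
    String user = "userName1";
    String jobName = "App1";                       // already trimmed job name

    Pattern historyFilePattern = Pattern.compile(
        hostname + "_" + "[0-9]+" + "_" + jobId + "_" + user + "_" + Pattern.quote(jobName) + "+");

    String candidate = "jt.example.com_1372200000000_job_201306251938_0001_userName1_App1";
    System.out.println(historyFilePattern.matcher(candidate).find());   // true
  }
}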
+ * @param fileName the history filename that needs checkpointing + * @param conf Job conf + * @throws IOException + */ + static synchronized void checkpointRecovery(String fileName, JobConf conf) + throws IOException { + Path logPath = JobHistoryCopy.JobInfo.getJobHistoryLogLocation(fileName); + if (logPath != null) { + LOG.info("Deleting job history file " + logPath.getName()); + LOGDIR_FS.delete(logPath, false); + } + // do the same for the user file too + logPath = JobHistoryCopy.JobInfo.getJobHistoryLogLocationForUser(fileName, + conf); + if (logPath != null) { + FileSystem fs = logPath.getFileSystem(conf); + fs.delete(logPath, false); + } + } + + static String getSecondaryJobHistoryFile(String filename) + throws IOException { + return encodeJobHistoryFileName( + decodeJobHistoryFileName(filename) + SECONDARY_FILE_SUFFIX); + } + + /** Selects one of the two files generated as a part of recovery. + * The thumb rule is that always select the oldest file. + * This call makes sure that only one file is left in the end. + * @param conf job conf + * @param logFilePath Path of the log file + * @throws IOException + */ + public synchronized static Path recoverJobHistoryFile(JobConf conf, + Path logFilePath) + throws IOException { + Path ret; + String logFileName = logFilePath.getName(); + String tmpFilename = getSecondaryJobHistoryFile(logFileName); + Path logDir = logFilePath.getParent(); + Path tmpFilePath = new Path(logDir, tmpFilename); + if (LOGDIR_FS.exists(logFilePath)) { + LOG.info(logFileName + " exists!"); + if (LOGDIR_FS.exists(tmpFilePath)) { + LOG.info("Deleting " + tmpFilename + + " and using " + logFileName + " for recovery."); + LOGDIR_FS.delete(tmpFilePath, false); + } + ret = tmpFilePath; + } else { + LOG.info(logFileName + " doesnt exist! Using " + + tmpFilename + " for recovery."); + if (LOGDIR_FS.exists(tmpFilePath)) { + LOG.info("Renaming " + tmpFilename + " to " + logFileName); + LOGDIR_FS.rename(tmpFilePath, logFilePath); + ret = tmpFilePath; + } else { + ret = logFilePath; + } + } + + // do the same for the user files too + logFilePath = getJobHistoryLogLocationForUser(logFileName, conf); + if (logFilePath != null) { + FileSystem fs = logFilePath.getFileSystem(conf); + logDir = logFilePath.getParent(); + tmpFilePath = new Path(logDir, tmpFilename); + if (fs.exists(logFilePath)) { + LOG.info(logFileName + " exists!"); + if (fs.exists(tmpFilePath)) { + LOG.info("Deleting " + tmpFilename + " and making " + logFileName + + " as the master history file for user."); + fs.delete(tmpFilePath, false); + } + } else { + LOG.info(logFileName + " doesnt exist! Using " + + tmpFilename + " as the master history file for user."); + if (fs.exists(tmpFilePath)) { + LOG.info("Renaming " + tmpFilename + " to " + logFileName + + " in user directory"); + fs.rename(tmpFilePath, logFilePath); + } + } + } + + return ret; + } + + /** Finalize the recovery and make one file in the end. + * This invloves renaming the recover file to the master file. + * Note that this api should be invoked only if recovery is involved. 
+ * @param id Job id + * @param conf the job conf + * @throws IOException + */ + static synchronized void finalizeRecovery(JobID id, JobConf conf) + throws IOException { + Path tmpLogPath = fileManager.getHistoryFile(id); + if (tmpLogPath == null) { + if (LOG.isDebugEnabled()) { + LOG.debug("No file for job with " + id + " found in cache!"); + } + return; + } + String tmpLogFileName = tmpLogPath.getName(); + + // get the primary filename from the cached filename + String masterLogFileName = + getPrimaryFilename(tmpLogFileName, getJobName(conf)); + Path masterLogPath = new Path(tmpLogPath.getParent(), masterLogFileName); + + // rename the tmp file to the master file. Note that this should be + // done only when the file is closed and handles are released. + LOG.info("Renaming " + tmpLogFileName + " to " + masterLogFileName); + LOGDIR_FS.rename(tmpLogPath, masterLogPath); + // update the cache + fileManager.setHistoryFile(id, masterLogPath); + + // do the same for the user file too + masterLogPath = + JobHistoryCopy.JobInfo.getJobHistoryLogLocationForUser(masterLogFileName, + conf); + tmpLogPath = + JobHistoryCopy.JobInfo.getJobHistoryLogLocationForUser(tmpLogFileName, + conf); + if (masterLogPath != null) { + FileSystem fs = masterLogPath.getFileSystem(conf); + if (fs.exists(tmpLogPath)) { + LOG.info("Renaming " + tmpLogFileName + " to " + masterLogFileName + + " in user directory"); + fs.rename(tmpLogPath, masterLogPath); + } + } + } + + /** + * Deletes job data from the local disk. + * For now just deletes the localized copy of job conf + */ + static void cleanupJob(JobID id) { + String localJobFilePath = JobInfo.getLocalJobFilePath(id); + File f = new File (localJobFilePath); + LOG.info("Deleting localized job conf at " + f); + if (!f.delete()) { + if (LOG.isDebugEnabled()) { + LOG.debug("Failed to delete file " + f); + } + } + } + + /** + * Delete job conf from the history folder. + */ + static void deleteConfFiles() throws IOException { + LOG.info("Cleaning up config files from the job history folder"); + FileSystem fs = new Path(LOG_DIR).getFileSystem(jtConf); + FileStatus[] status = fs.listStatus(new Path(LOG_DIR), CONF_FILTER); + for (FileStatus s : status) { + LOG.info("Deleting conf file " + s.getPath()); + fs.delete(s.getPath(), false); + } + } + + /** + * Move the completed job into the completed folder. + * This assumes that the jobhistory file is closed and all operations on the + * jobhistory file is complete. + * This *should* be the last call to jobhistory for a given job. + */ + static void markCompleted(JobID id) throws IOException { + fileManager.moveToDone(id); + } + + /** + * Log job submitted event to history. Creates a new file in history + * for the job. if history file creation fails, it disables history + * for all other events. + * @param jobId job id assigned by job tracker. + * @param jobConf job conf of the job + * @param jobConfPath path to job conf xml file in HDFS. + * @param submitTime time when job tracker received the job + * @throws IOException + * @deprecated Use + * {@link #logSubmitted(JobID, JobConf, String, long, boolean)} instead. 
+ */ + @Deprecated + public static void logSubmitted(JobID jobId, JobConf jobConf, + String jobConfPath, long submitTime) + throws IOException { + logSubmitted(jobId, jobConf, jobConfPath, submitTime, true); + } + + public static void logSubmitted(JobID jobId, JobConf jobConf, + String jobConfPath, long submitTime, + boolean restarted) + throws IOException { + FileSystem fs = null; + String userLogDir = null; + String jobUniqueString = JOBTRACKER_UNIQUE_STRING + jobId; + + // Get the username and job name to be used in the actual log filename; + // sanity check them too + String jobName = getJobName(jobConf); + String user = getUserName(jobConf); + + // get the history filename + String logFileName = null; + if (restarted) { + logFileName = getJobHistoryFileName(jobConf, jobId); + if (logFileName == null) { + logFileName = + encodeJobHistoryFileName(getNewJobHistoryFileName(jobConf, jobId)); + } else { + String parts[] = logFileName.split("_"); + //TODO this is a hack :( + // jobtracker-hostname_jobtracker-identifier_ + String jtUniqueString = parts[0] + "_" + parts[1] + "_"; + jobUniqueString = jtUniqueString + jobId.toString(); + } + } else { + logFileName = + encodeJobHistoryFileName(getNewJobHistoryFileName(jobConf, jobId)); + } + + // setup the history log file for this job + Path logFile = getJobHistoryLogLocation(logFileName); + + // find user log directory + Path userLogFile = + getJobHistoryLogLocationForUser(logFileName, jobConf); + PrintWriter writer = null; + try{ + FSDataOutputStream out = null; + if (LOG_DIR != null) { + // create output stream for logging in hadoop.job.history.location + if (restarted) { + logFile = recoverJobHistoryFile(jobConf, logFile); + logFileName = logFile.getName(); + } + + int defaultBufferSize = + LOGDIR_FS.getConf().getInt("io.file.buffer.size", 4096); + out = LOGDIR_FS.create(logFile, + new FsPermission(HISTORY_FILE_PERMISSION), + true, + defaultBufferSize, + LOGDIR_FS.getDefaultReplication(), + jobHistoryBlockSize, null); + writer = new PrintWriter(out); + fileManager.addWriter(jobId, writer); + + // cache it ... 
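+          // remember the history file for this job id so that later calls,
+          // e.g. finalizeRecovery(), can look it up through fileManager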
+ fileManager.setHistoryFile(jobId, logFile); + } + if (userLogFile != null) { + // Get the actual filename as recoverJobHistoryFile() might return + // a different filename + userLogDir = userLogFile.getParent().toString(); + userLogFile = new Path(userLogDir, logFileName); + + // create output stream for logging + // in hadoop.job.history.user.location + fs = userLogFile.getFileSystem(jobConf); + + out = fs.create(userLogFile, true, 4096); + writer = new PrintWriter(out); + fileManager.addWriter(jobId, writer); + } + + ArrayList writers = fileManager.getWriters(jobId); + // Log the history meta info + JobHistoryCopy.MetaInfoManager.logMetaInfo(writers); + + String viewJobACL = "*"; + String modifyJobACL = "*"; + if (aclsEnabled) { + viewJobACL = jobConf.get(JobACL.VIEW_JOB.getAclName(), " "); + modifyJobACL = jobConf.get(JobACL.MODIFY_JOB.getAclName(), " "); + } + //add to writer as well + JobHistoryCopy.log(writers, RecordTypes.Job, + new JobHistoryKeys[]{JobHistoryKeys.JOBID, JobHistoryKeys.JOBNAME, JobHistoryKeys.USER, + JobHistoryKeys.SUBMIT_TIME, JobHistoryKeys.JOBCONF, + JobHistoryKeys.VIEW_JOB, JobHistoryKeys.MODIFY_JOB, + JobHistoryKeys.JOB_QUEUE}, + new String[]{jobId.toString(), jobName, user, + String.valueOf(submitTime) , jobConfPath, + viewJobACL, modifyJobACL, + jobConf.getQueueName()}, jobId + ); + + }catch(IOException e){ + LOG.error("Failed creating job history log file for job " + jobId, e); + if (writer != null) { + fileManager.removeWriter(jobId, writer); + } + } + // Always store job conf on local file system + String localJobFilePath = JobInfo.getLocalJobFilePath(jobId); + File localJobFile = new File(localJobFilePath); + FileOutputStream jobOut = null; + try { + jobOut = new FileOutputStream(localJobFile); + jobConf.writeXml(jobOut); + if (LOG.isDebugEnabled()) { + LOG.debug("Job conf for " + jobId + " stored at " + + localJobFile.getAbsolutePath()); + } + } catch (IOException ioe) { + LOG.error("Failed to store job conf on the local filesystem ", ioe); + } finally { + if (jobOut != null) { + try { + jobOut.close(); + } catch (IOException ie) { + LOG.info("Failed to close the job configuration file " + + StringUtils.stringifyException(ie)); + } + } + } + + /* Storing the job conf on the log dir */ + Path jobFilePath = null; + if (LOG_DIR != null) { + jobFilePath = new Path(LOG_DIR + File.separator + + jobUniqueString + "_conf.xml"); + fileManager.setConfFile(jobId, jobFilePath); + } + Path userJobFilePath = null; + if (userLogDir != null) { + userJobFilePath = new Path(userLogDir + File.separator + + jobUniqueString + "_conf.xml"); + } + FSDataOutputStream jobFileOut = null; + try { + if (LOG_DIR != null) { + int defaultBufferSize = + LOGDIR_FS.getConf().getInt("io.file.buffer.size", 4096); + if (!LOGDIR_FS.exists(jobFilePath)) { + jobFileOut = LOGDIR_FS.create(jobFilePath, + new FsPermission(HISTORY_FILE_PERMISSION), + true, + defaultBufferSize, + LOGDIR_FS.getDefaultReplication(), + LOGDIR_FS.getDefaultBlockSize(), null); + jobConf.writeXml(jobFileOut); + jobFileOut.close(); + } + } + if (userLogDir != null) { + fs = new Path(userLogDir).getFileSystem(jobConf); + jobFileOut = fs.create(userJobFilePath); + jobConf.writeXml(jobFileOut); + } + if (LOG.isDebugEnabled()) { + LOG.debug("Job conf for " + jobId + " stored at " + + jobFilePath + "and" + userJobFilePath ); + } + } catch (IOException ioe) { + LOG.error("Failed to store job conf in the log dir", ioe); + } finally { + if (jobFileOut != null) { + try { + jobFileOut.close(); + } catch (IOException ie) { + 
LOG.info("Failed to close the job configuration file " + + StringUtils.stringifyException(ie)); + } + } + } + } + /** + * Logs launch time of job. + * + * @param jobId job id, assigned by jobtracker. + * @param startTime start time of job. + * @param totalMaps total maps assigned by jobtracker. + * @param totalReduces total reduces. + */ + public static void logInited(JobID jobId, long startTime, + int totalMaps, int totalReduces) { + ArrayList writer = fileManager.getWriters(jobId); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.Job, + new JobHistoryKeys[] {JobHistoryKeys.JOBID, JobHistoryKeys.LAUNCH_TIME, + JobHistoryKeys.TOTAL_MAPS, JobHistoryKeys.TOTAL_REDUCES, + JobHistoryKeys.JOB_STATUS}, + new String[] {jobId.toString(), String.valueOf(startTime), + String.valueOf(totalMaps), + String.valueOf(totalReduces), + Values.PREP.name()}, jobId); + } + } + + /** + * Logs the job as RUNNING. + * + * @param jobId job id, assigned by jobtracker. + * @param startTime start time of job. + * @param totalMaps total maps assigned by jobtracker. + * @param totalReduces total reduces. + * @deprecated Use {@link #logInited(JobID, long, int, int)} and + * {@link #logStarted(JobID)} + */ + @Deprecated + public static void logStarted(JobID jobId, long startTime, + int totalMaps, int totalReduces) { + logStarted(jobId); + } + + /** + * Logs job as running + * @param jobId job id, assigned by jobtracker. + */ + public static void logStarted(JobID jobId){ + ArrayList writer = fileManager.getWriters(jobId); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.Job, + new JobHistoryKeys[] {JobHistoryKeys.JOBID, JobHistoryKeys.JOB_STATUS}, + new String[] {jobId.toString(), + Values.RUNNING.name()}, jobId); + } + } + + /** + * Log job finished. closes the job file in history. + * @param jobId job id, assigned by jobtracker. + * @param finishTime finish time of job in ms. + * @param finishedMaps no of maps successfully finished. + * @param finishedReduces no of reduces finished sucessfully. + * @param failedMaps no of failed map tasks. + * @param failedReduces no of failed reduce tasks. + * @param counters the counters from the job + */ + public static void logFinished(JobID jobId, long finishTime, + int finishedMaps, int finishedReduces, + int failedMaps, int failedReduces, + Counters mapCounters, + Counters reduceCounters, + Counters counters) { + // close job file for this job + ArrayList writer = fileManager.getWriters(jobId); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.Job, + new JobHistoryKeys[] {JobHistoryKeys.JOBID, JobHistoryKeys.FINISH_TIME, + JobHistoryKeys.JOB_STATUS, + JobHistoryKeys.FINISHED_MAPS, + JobHistoryKeys.FINISHED_REDUCES, + JobHistoryKeys.FAILED_MAPS, + JobHistoryKeys.FAILED_REDUCES, + JobHistoryKeys.MAP_COUNTERS, + JobHistoryKeys.REDUCE_COUNTERS, + JobHistoryKeys.COUNTERS}, + new String[] {jobId.toString(), Long.toString(finishTime), + Values.SUCCESS.name(), + String.valueOf(finishedMaps), + String.valueOf(finishedReduces), + String.valueOf(failedMaps), + String.valueOf(failedReduces), + mapCounters.makeEscapedCompactString(), + reduceCounters.makeEscapedCompactString(), + counters.makeEscapedCompactString()}, jobId); + for (PrintWriter out : writer) { + out.close(); + } + } + Thread historyCleaner = new Thread(new HistoryCleaner()); + historyCleaner.start(); + } + /** + * Logs job failed event. Closes the job history log file. + * @param jobid job id + * @param timestamp time when job failure was detected in ms. 
+ * @param finishedMaps no finished map tasks. + * @param finishedReduces no of finished reduce tasks. + */ + public static void logFailed(JobID jobid, long timestamp, int finishedMaps, int finishedReduces){ + ArrayList writer = fileManager.getWriters(jobid); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.Job, + new JobHistoryKeys[] {JobHistoryKeys.JOBID, JobHistoryKeys.FINISH_TIME, + JobHistoryKeys.JOB_STATUS, JobHistoryKeys.FINISHED_MAPS, + JobHistoryKeys.FINISHED_REDUCES }, + new String[] {jobid.toString(), String.valueOf(timestamp), Values.FAILED.name(), String.valueOf(finishedMaps), + String.valueOf(finishedReduces)}, jobid); + for (PrintWriter out : writer) { + out.close(); + } + } + } + /** + * Logs job killed event. Closes the job history log file. + * + * @param jobid + * job id + * @param timestamp + * time when job killed was issued in ms. + * @param finishedMaps + * no finished map tasks. + * @param finishedReduces + * no of finished reduce tasks. + */ + public static void logKilled(JobID jobid, long timestamp, int finishedMaps, + int finishedReduces) { + ArrayList writer = fileManager.getWriters(jobid); + + if (null != writer) { + JobHistoryCopy.log(writer, RecordTypes.Job, new JobHistoryKeys[] { JobHistoryKeys.JOBID, + JobHistoryKeys.FINISH_TIME, JobHistoryKeys.JOB_STATUS, JobHistoryKeys.FINISHED_MAPS, + JobHistoryKeys.FINISHED_REDUCES }, new String[] { jobid.toString(), + String.valueOf(timestamp), Values.KILLED.name(), + String.valueOf(finishedMaps), String.valueOf(finishedReduces) }, jobid); + for (PrintWriter out : writer) { + out.close(); + } + } + } + /** + * Log job's priority. + * @param jobid job id + * @param priority Jobs priority + */ + public static void logJobPriority(JobID jobid, JobPriority priority){ + ArrayList writer = fileManager.getWriters(jobid); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.Job, + new JobHistoryKeys[] {JobHistoryKeys.JOBID, JobHistoryKeys.JOB_PRIORITY}, + new String[] {jobid.toString(), priority.toString()}, jobid); + } + } + /** + * Log job's submit-time/launch-time + * @param jobid job id + * @param submitTime job's submit time + * @param launchTime job's launch time + * @param restartCount number of times the job got restarted + * @deprecated Use {@link #logJobInfo(JobID, long, long)} instead. + */ + @Deprecated + public static void logJobInfo(JobID jobid, long submitTime, long launchTime, + int restartCount){ + logJobInfo(jobid, submitTime, launchTime); + } + + public static void logJobInfo(JobID jobid, long submitTime, long launchTime) + { + ArrayList writer = fileManager.getWriters(jobid); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.Job, + new JobHistoryKeys[] {JobHistoryKeys.JOBID, JobHistoryKeys.SUBMIT_TIME, + JobHistoryKeys.LAUNCH_TIME}, + new String[] {jobid.toString(), + String.valueOf(submitTime), + String.valueOf(launchTime)}, jobid); + } + } + } + + /** + * Helper class for logging or reading back events related to Task's start, finish or failure. + * All events logged by this class are logged in a separate file per job in + * job tracker history. These events map to TIPs in jobtracker. + */ + public static class Task extends KeyValuePair{ + private Map taskAttempts = new TreeMap(); + + /** + * Log start time of task (TIP). + * @param taskId task id + * @param taskType MAP or REDUCE + * @param startTime startTime of tip. 
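+     * @param splitLocations split locations of the task, logged under the SPLITS key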
+ */ + public static void logStarted(TaskID taskId, String taskType, + long startTime, String splitLocations) { + JobID id = taskId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.Task, + new JobHistoryKeys[]{JobHistoryKeys.TASKID, JobHistoryKeys.TASK_TYPE, + JobHistoryKeys.START_TIME, JobHistoryKeys.SPLITS}, + new String[]{taskId.toString(), taskType, + String.valueOf(startTime), + splitLocations}, id); + } + } + /** + * Log finish time of task. + * @param taskId task id + * @param taskType MAP or REDUCE + * @param finishTime finish timeof task in ms + */ + public static void logFinished(TaskID taskId, String taskType, + long finishTime, Counters counters){ + JobID id = taskId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.Task, + new JobHistoryKeys[]{ JobHistoryKeys.TASKID, JobHistoryKeys.TASK_TYPE, + JobHistoryKeys.TASK_STATUS, JobHistoryKeys.FINISH_TIME, + JobHistoryKeys.COUNTERS}, + new String[]{ taskId.toString(), taskType, Values.SUCCESS.name(), + String.valueOf(finishTime), + counters.makeEscapedCompactString()}, id); + } + } + + /** + * Update the finish time of task. + * @param taskId task id + * @param finishTime finish time of task in ms + */ + public static void logUpdates(TaskID taskId, long finishTime){ + JobID id = taskId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.Task, + new JobHistoryKeys[]{JobHistoryKeys.TASKID, JobHistoryKeys.FINISH_TIME}, + new String[]{ taskId.toString(), + String.valueOf(finishTime)}, id); + } + } + + /** + * Log job failed event. + * @param taskId task id + * @param taskType MAP or REDUCE. + * @param time timestamp when job failed detected. + * @param error error message for failure. + */ + public static void logFailed(TaskID taskId, String taskType, long time, String error){ + logFailed(taskId, taskType, time, error, null); + } + + /** + * @param failedDueToAttempt The attempt that caused the failure, if any + */ + public static void logFailed(TaskID taskId, String taskType, long time, + String error, + TaskAttemptID failedDueToAttempt){ + JobID id = taskId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + String failedAttempt = failedDueToAttempt == null + ? "" + : failedDueToAttempt.toString(); + JobHistoryCopy.log(writer, RecordTypes.Task, + new JobHistoryKeys[]{ JobHistoryKeys.TASKID, JobHistoryKeys.TASK_TYPE, + JobHistoryKeys.TASK_STATUS, JobHistoryKeys.FINISH_TIME, + JobHistoryKeys.ERROR, JobHistoryKeys.TASK_ATTEMPT_ID}, + new String[]{ taskId.toString(), taskType, + Values.FAILED.name(), + String.valueOf(time), error, + failedAttempt}, id); + } + } + /** + * Returns all task attempts for this task. + */ + public Map getTaskAttempts(){ + return this.taskAttempts; + } + } + + /** + * Base class for Map and Reduce TaskAttempts. + */ + public static class TaskAttempt extends Task{} + + /** + * Helper class for logging or reading back events related to start, finish or failure of + * a Map Attempt on a node. + */ + public static class MapAttempt extends TaskAttempt{ + /** + * Log start time of this map task attempt. + * @param taskAttemptId task attempt id + * @param startTime start time of task attempt as reported by task tracker. + * @param hostName host name of the task attempt. 
+ * @deprecated Use + * {@link #logStarted(TaskAttemptID, long, String, int, String)} + */ + @Deprecated + public static void logStarted(TaskAttemptID taskAttemptId, long startTime, String hostName){ + logStarted(taskAttemptId, startTime, hostName, -1, Values.MAP.name()); + } + + /** + * Log start time of this map task attempt. + * + * @param taskAttemptId task attempt id + * @param startTime start time of task attempt as reported by task tracker. + * @param trackerName name of the tracker executing the task attempt. + * @param httpPort http port of the task tracker executing the task attempt + * @param taskType Whether the attempt is cleanup or setup or map + */ + public static void logStarted(TaskAttemptID taskAttemptId, long startTime, + String trackerName, int httpPort, + String taskType) { + JobID id = taskAttemptId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.MapAttempt, + new JobHistoryKeys[]{ JobHistoryKeys.TASK_TYPE, JobHistoryKeys.TASKID, + JobHistoryKeys.TASK_ATTEMPT_ID, JobHistoryKeys.START_TIME, + JobHistoryKeys.TRACKER_NAME, JobHistoryKeys.HTTP_PORT}, + new String[]{taskType, + taskAttemptId.getTaskID().toString(), + taskAttemptId.toString(), + String.valueOf(startTime), trackerName, + httpPort == -1 ? "" : + String.valueOf(httpPort)}, id); + } + } + + /** + * Log finish time of map task attempt. + * @param taskAttemptId task attempt id + * @param finishTime finish time + * @param hostName host name + * @deprecated Use + * {@link #logFinished(TaskAttemptID, long, String, String, String, Counters)} + */ + @Deprecated + public static void logFinished(TaskAttemptID taskAttemptId, long finishTime, + String hostName){ + logFinished(taskAttemptId, finishTime, hostName, Values.MAP.name(), "", + new Counters()); + } + + /** + * Log finish time of map task attempt. + * + * @param taskAttemptId task attempt id + * @param finishTime finish time + * @param hostName host name + * @param taskType Whether the attempt is cleanup or setup or map + * @param stateString state string of the task attempt + * @param counter counters of the task attempt + */ + public static void logFinished(TaskAttemptID taskAttemptId, + long finishTime, + String hostName, + String taskType, + String stateString, + Counters counter) { + JobID id = taskAttemptId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.MapAttempt, + new JobHistoryKeys[]{ JobHistoryKeys.TASK_TYPE, JobHistoryKeys.TASKID, + JobHistoryKeys.TASK_ATTEMPT_ID, JobHistoryKeys.TASK_STATUS, + JobHistoryKeys.FINISH_TIME, JobHistoryKeys.HOSTNAME, + JobHistoryKeys.STATE_STRING, JobHistoryKeys.COUNTERS}, + new String[]{taskType, + taskAttemptId.getTaskID().toString(), + taskAttemptId.toString(), + Values.SUCCESS.name(), + String.valueOf(finishTime), hostName, + stateString, + counter.makeEscapedCompactString()}, id); + } + } + + /** + * Log task attempt failed event. + * @param taskAttemptId task attempt id + * @param timestamp timestamp + * @param hostName hostname of this task attempt. + * @param error error message if any for this task attempt. + * @deprecated Use + * {@link #logFailed(TaskAttemptID, long, String, String, String)} + */ + @Deprecated + public static void logFailed(TaskAttemptID taskAttemptId, + long timestamp, String hostName, + String error) { + logFailed(taskAttemptId, timestamp, hostName, error, Values.MAP.name()); + } + + /** + * Log task attempt failed event. 
+ * + * @param taskAttemptId task attempt id + * @param timestamp timestamp + * @param hostName hostname of this task attempt. + * @param error error message if any for this task attempt. + * @param taskType Whether the attempt is cleanup or setup or map + */ + public static void logFailed(TaskAttemptID taskAttemptId, + long timestamp, String hostName, + String error, String taskType) { + JobID id = taskAttemptId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.MapAttempt, + new JobHistoryKeys[]{JobHistoryKeys.TASK_TYPE, JobHistoryKeys.TASKID, + JobHistoryKeys.TASK_ATTEMPT_ID, JobHistoryKeys.TASK_STATUS, + JobHistoryKeys.FINISH_TIME, JobHistoryKeys.HOSTNAME, JobHistoryKeys.ERROR}, + new String[]{ taskType, + taskAttemptId.getTaskID().toString(), + taskAttemptId.toString(), + Values.FAILED.name(), + String.valueOf(timestamp), + hostName, error}, id); + } + } + + /** + * Log task attempt killed event. + * @param taskAttemptId task attempt id + * @param timestamp timestamp + * @param hostName hostname of this task attempt. + * @param error error message if any for this task attempt. + * @deprecated Use + * {@link #logKilled(TaskAttemptID, long, String, String, String)} + */ + @Deprecated + public static void logKilled(TaskAttemptID taskAttemptId, + long timestamp, String hostName, String error){ + logKilled(taskAttemptId, timestamp, hostName, error, Values.MAP.name()); + } + + /** + * Log task attempt killed event. + * + * @param taskAttemptId task attempt id + * @param timestamp timestamp + * @param hostName hostname of this task attempt. + * @param error error message if any for this task attempt. + * @param taskType Whether the attempt is cleanup or setup or map + */ + public static void logKilled(TaskAttemptID taskAttemptId, + long timestamp, String hostName, + String error, String taskType) { + JobID id = taskAttemptId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.MapAttempt, + new JobHistoryKeys[]{JobHistoryKeys.TASK_TYPE, JobHistoryKeys.TASKID, + JobHistoryKeys.TASK_ATTEMPT_ID, JobHistoryKeys.TASK_STATUS, + JobHistoryKeys.FINISH_TIME, JobHistoryKeys.HOSTNAME, + JobHistoryKeys.ERROR}, + new String[]{ taskType, + taskAttemptId.getTaskID().toString(), + taskAttemptId.toString(), + Values.KILLED.name(), + String.valueOf(timestamp), + hostName, error}, id); + } + } + } + /** + * Helper class for logging or reading back events related to start, finish or failure of + * a Map Attempt on a node. + */ + public static class ReduceAttempt extends TaskAttempt{ + /** + * Log start time of Reduce task attempt. + * @param taskAttemptId task attempt id + * @param startTime start time + * @param hostName host name + * @deprecated Use + * {@link #logStarted(TaskAttemptID, long, String, int, String)} + */ + @Deprecated + public static void logStarted(TaskAttemptID taskAttemptId, + long startTime, String hostName){ + logStarted(taskAttemptId, startTime, hostName, -1, Values.REDUCE.name()); + } + + /** + * Log start time of Reduce task attempt. 
+ * + * @param taskAttemptId task attempt id + * @param startTime start time + * @param trackerName tracker name + * @param httpPort the http port of the tracker executing the task attempt + * @param taskType Whether the attempt is cleanup or setup or reduce + */ + public static void logStarted(TaskAttemptID taskAttemptId, + long startTime, String trackerName, + int httpPort, + String taskType) { + JobID id = taskAttemptId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.ReduceAttempt, + new JobHistoryKeys[]{ JobHistoryKeys.TASK_TYPE, JobHistoryKeys.TASKID, + JobHistoryKeys.TASK_ATTEMPT_ID, JobHistoryKeys.START_TIME, + JobHistoryKeys.TRACKER_NAME, JobHistoryKeys.HTTP_PORT}, + new String[]{taskType, + taskAttemptId.getTaskID().toString(), + taskAttemptId.toString(), + String.valueOf(startTime), trackerName, + httpPort == -1 ? "" : + String.valueOf(httpPort)}, id); + } + } + + /** + * Log finished event of this task. + * @param taskAttemptId task attempt id + * @param shuffleFinished shuffle finish time + * @param sortFinished sort finish time + * @param finishTime finish time of task + * @param hostName host name where task attempt executed + * @deprecated Use + * {@link #logFinished(TaskAttemptID, long, long, long, String, String, String, Counters)} + */ + @Deprecated + public static void logFinished(TaskAttemptID taskAttemptId, long shuffleFinished, + long sortFinished, long finishTime, + String hostName){ + logFinished(taskAttemptId, shuffleFinished, sortFinished, + finishTime, hostName, Values.REDUCE.name(), + "", new Counters()); + } + + /** + * Log finished event of this task. + * + * @param taskAttemptId task attempt id + * @param shuffleFinished shuffle finish time + * @param sortFinished sort finish time + * @param finishTime finish time of task + * @param hostName host name where task attempt executed + * @param taskType Whether the attempt is cleanup or setup or reduce + * @param stateString the state string of the attempt + * @param counter counters of the attempt + */ + public static void logFinished(TaskAttemptID taskAttemptId, + long shuffleFinished, + long sortFinished, long finishTime, + String hostName, String taskType, + String stateString, Counters counter) { + JobID id = taskAttemptId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.ReduceAttempt, + new JobHistoryKeys[]{ JobHistoryKeys.TASK_TYPE, JobHistoryKeys.TASKID, + JobHistoryKeys.TASK_ATTEMPT_ID, JobHistoryKeys.TASK_STATUS, + JobHistoryKeys.SHUFFLE_FINISHED, JobHistoryKeys.SORT_FINISHED, + JobHistoryKeys.FINISH_TIME, JobHistoryKeys.HOSTNAME, + JobHistoryKeys.STATE_STRING, JobHistoryKeys.COUNTERS}, + new String[]{taskType, + taskAttemptId.getTaskID().toString(), + taskAttemptId.toString(), + Values.SUCCESS.name(), + String.valueOf(shuffleFinished), + String.valueOf(sortFinished), + String.valueOf(finishTime), hostName, + stateString, + counter.makeEscapedCompactString()}, id); + } + } + + /** + * Log failed reduce task attempt. + * @param taskAttemptId task attempt id + * @param timestamp time stamp when task failed + * @param hostName host name of the task attempt. + * @param error error message of the task. 
+ * @deprecated Use + * {@link #logFailed(TaskAttemptID, long, String, String, String)} + */ + @Deprecated + public static void logFailed(TaskAttemptID taskAttemptId, long timestamp, + String hostName, String error){ + logFailed(taskAttemptId, timestamp, hostName, error, Values.REDUCE.name()); + } + + /** + * Log failed reduce task attempt. + * + * @param taskAttemptId task attempt id + * @param timestamp time stamp when task failed + * @param hostName host name of the task attempt. + * @param error error message of the task. + * @param taskType Whether the attempt is cleanup or setup or reduce + */ + public static void logFailed(TaskAttemptID taskAttemptId, long timestamp, + String hostName, String error, + String taskType) { + JobID id = taskAttemptId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.ReduceAttempt, + new JobHistoryKeys[]{ JobHistoryKeys.TASK_TYPE, JobHistoryKeys.TASKID, + JobHistoryKeys.TASK_ATTEMPT_ID, JobHistoryKeys.TASK_STATUS, + JobHistoryKeys.FINISH_TIME, JobHistoryKeys.HOSTNAME, + JobHistoryKeys.ERROR }, + new String[]{ taskType, + taskAttemptId.getTaskID().toString(), + taskAttemptId.toString(), + Values.FAILED.name(), + String.valueOf(timestamp), hostName, error }, id); + } + } + + /** + * Log killed reduce task attempt. + * @param taskAttemptId task attempt id + * @param timestamp time stamp when task failed + * @param hostName host name of the task attempt. + * @param error error message of the task. + * @deprecated Use + * {@link #logKilled(TaskAttemptID, long, String, String, String)} + */ + @Deprecated + public static void logKilled(TaskAttemptID taskAttemptId, long timestamp, + String hostName, String error) { + logKilled(taskAttemptId, timestamp, hostName, error, Values.REDUCE.name()); + } + + /** + * Log killed reduce task attempt. + * + * @param taskAttemptId task attempt id + * @param timestamp time stamp when task failed + * @param hostName host name of the task attempt. + * @param error error message of the task. + * @param taskType Whether the attempt is cleanup or setup or reduce + */ + public static void logKilled(TaskAttemptID taskAttemptId, long timestamp, + String hostName, String error, + String taskType) { + JobID id = taskAttemptId.getJobID(); + ArrayList writer = fileManager.getWriters(id); + + if (null != writer){ + JobHistoryCopy.log(writer, RecordTypes.ReduceAttempt, + new JobHistoryKeys[]{ JobHistoryKeys.TASK_TYPE, JobHistoryKeys.TASKID, + JobHistoryKeys.TASK_ATTEMPT_ID, JobHistoryKeys.TASK_STATUS, + JobHistoryKeys.FINISH_TIME, JobHistoryKeys.HOSTNAME, + JobHistoryKeys.ERROR }, + new String[]{ taskType, + taskAttemptId.getTaskID().toString(), + taskAttemptId.toString(), + Values.KILLED.name(), + String.valueOf(timestamp), + hostName, error }, id); + } + } + } + + /** + * Callback interface for reading back log events from JobHistory. This interface + * should be implemented and passed to JobHistory.parseHistory() + * + */ + public static interface Listener{ + /** + * Callback method for history parser. + * @param recType type of record, which is the first entry in the line. + * @param values a map of key-value pairs as thry appear in history. + * @throws IOException + */ + public void handle(RecordTypes recType, Map values) throws IOException; + } + + /** + * Delete history files older than one month. Update master index and remove all + * jobs older than one month. Also if a job tracker has no jobs in last one month + * remove reference to the job tracker. 
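+   * The cleaner runs at most once per day; it is kicked off from
+   * logFinished() after a job completes.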
+ * + */ + public static class HistoryCleaner implements Runnable{ + static final long ONE_DAY_IN_MS = 24 * 60 * 60 * 1000L; + static final long THIRTY_DAYS_IN_MS = 30 * ONE_DAY_IN_MS; + private long now; + private static boolean isRunning = false; + private static long lastRan = 0; + + /** + * Cleans up history data. + */ + public void run(){ + if (isRunning){ + return; + } + now = System.currentTimeMillis(); + // clean history only once a day at max + if (lastRan != 0 && (now - lastRan) < ONE_DAY_IN_MS) { + return; + } + lastRan = now; + isRunning = true; + try { + FileStatus[] historyFiles = DONEDIR_FS.listStatus(DONE); + // delete if older than 30 days + if (historyFiles != null) { + for (FileStatus f : historyFiles) { + if (now - f.getModificationTime() > THIRTY_DAYS_IN_MS) { + DONEDIR_FS.delete(f.getPath(), true); + LOG.info("Deleting old history file : " + f.getPath()); + } + } + } + + //walking over the map to purge entries from jobHistoryFileMap + synchronized (jobHistoryFileMap) { + Iterator> it = + jobHistoryFileMap.entrySet().iterator(); + while (it.hasNext()) { + MovedFileInfo info = it.next().getValue(); + if (now - info.timestamp > THIRTY_DAYS_IN_MS) { + it.remove(); + } else { + //since entries are in sorted timestamp order, no more entries + //are required to be checked + break; + } + } + } + } catch (IOException ie) { + LOG.info("Error cleaning up history directory" + + StringUtils.stringifyException(ie)); + } + isRunning = false; + } + + static long getLastRan() { + return lastRan; + } + } + + /** + * Return the TaskLogsUrl of a particular TaskAttempt + * + * @param attempt + * @return the taskLogsUrl. null if http-port or tracker-name or + * task-attempt-id are unavailable. + */ + public static String getTaskLogsUrl(JobHistoryCopy.TaskAttempt attempt) { + if (attempt.get(JobHistoryKeys.HTTP_PORT).equals("") + || attempt.get(JobHistoryKeys.TRACKER_NAME).equals("") + || attempt.get(JobHistoryKeys.TASK_ATTEMPT_ID).equals("")) { + return null; + } + + String taskTrackerName = + JobInProgress.convertTrackerNameToHostName( + attempt.get(JobHistoryKeys.TRACKER_NAME)); + return TaskLogServlet.getTaskLogUrl(taskTrackerName, attempt + .get(JobHistoryKeys.HTTP_PORT), attempt.get(JobHistoryKeys.TASK_ATTEMPT_ID)); + } +} diff --git a/hraven-etl/src/test/java/com/twitter/hraven/JobConfFileTest.java b/hraven-etl/src/test/java/com/twitter/hraven/JobConfFileTest.java new file mode 100644 index 0000000..9909921 --- /dev/null +++ b/hraven-etl/src/test/java/com/twitter/hraven/JobConfFileTest.java @@ -0,0 +1,56 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.junit.Test; + +import com.twitter.hraven.Constants; + +public class JobConfFileTest { + + @Test + public void test() { + + Pattern pigLogfilePattern = Pattern + .compile(Constants.PIG_LOGFILE_PATTERN_REGEX); + + // /var/log/pig/pig_1334818693838.log + String pigLogfile = "/var/log/pig/pig_1334818693838.log"; + Matcher matcher = pigLogfilePattern.matcher(pigLogfile); + + String runId = null; + if (matcher.matches()) { + runId = matcher.group(1); + // TODO: validate this parsing (testcase?!?) + } else { + runId = "blah"; + } + + String appId = "Distributed Lzo Indexer [/tmp/akamai/akamai"; + appId = "[/tmp/akamai/akamai"; + + int firstOpenBracketPos = appId.indexOf("["); + if (firstOpenBracketPos > -1) { + appId = appId.substring(0, firstOpenBracketPos).trim(); + } + System.out.println("appId="+appId+"."); + + } + +} diff --git a/hraven-etl/src/test/java/com/twitter/hraven/TestJobFile.java b/hraven-etl/src/test/java/com/twitter/hraven/TestJobFile.java new file mode 100644 index 0000000..39accd3 --- /dev/null +++ b/hraven-etl/src/test/java/com/twitter/hraven/TestJobFile.java @@ -0,0 +1,78 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +import com.twitter.hraven.etl.JobFile; + +/** + * Test the {@link JobFile} class. + * + */ +public class TestJobFile { + + final static String VALID_JOB_CONF_FILENAME = "hostname1.example.com_1333569494142_job_201204041958_150125_conf.xml"; + final static String VALID_JOB_HISTORY_FILENAME = "hostname1.example.com_1333569494142_job_201204041958_1599_hadoop_App1%3Asomething%3Axyz%2F04%2F03-00%3A00%3A"; + final static String VALID_JOB_CONF_FILENAME2 = "hostname2.example.com_1334279672946_job_201204130114_0020_conf.xml"; + final static String VALID_JOB_HISTORY_FILENAME2 = "hostname2.example.com_1334279672946_job_201204130114_0020_user1_JobConfParser"; + + final static String INVALID_JOB_FILENAME = "jabbedabbedoo.txt"; + + /** + * Test the conf file. 
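+   * Verifies that conf and history filenames are classified correctly and
+   * that the job id and jobtracker host are parsed out of each filename.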
+ */ + @Test + public void testJobConfFile() { + + JobFile jobFile = new JobFile(VALID_JOB_CONF_FILENAME); + assertTrue("This should be a valid jobfile", jobFile.isJobConfFile()); + assertFalse("this should not be a job history file", + jobFile.isJobHistoryFile()); + assertEquals("job_201204041958_150125", jobFile.getJobid()); + assertEquals("hostname1.example.com", jobFile.getJobTracker()); + + jobFile = new JobFile(VALID_JOB_HISTORY_FILENAME); + assertFalse("This should not be a valid jobfile", jobFile.isJobConfFile()); + assertTrue("this should be a job history file", jobFile.isJobHistoryFile()); + assertEquals("job_201204041958_1599", jobFile.getJobid()); + assertEquals("hostname1.example.com", jobFile.getJobTracker()); + + jobFile = new JobFile(VALID_JOB_CONF_FILENAME2); + assertTrue("This should be a valid jobfile", jobFile.isJobConfFile()); + assertFalse("this should not be a job history file", + jobFile.isJobHistoryFile()); + assertEquals("job_201204130114_0020", jobFile.getJobid()); + assertEquals("hostname2.example.com", jobFile.getJobTracker()); + + jobFile = new JobFile(VALID_JOB_HISTORY_FILENAME2); + assertFalse("This should not be a valid jobfile", jobFile.isJobConfFile()); + assertTrue("this should be a job history file", jobFile.isJobHistoryFile()); + assertEquals("job_201204130114_0020", jobFile.getJobid()); + assertEquals("hostname2.example.com", jobFile.getJobTracker()); + + jobFile = new JobFile(INVALID_JOB_FILENAME); + assertFalse("This should not be a valid jobfile", jobFile.isJobConfFile()); + assertFalse("this should not be a job history file", + jobFile.isJobHistoryFile()); + + } + +} diff --git a/hraven-etl/src/test/java/com/twitter/hraven/etl/AssertHistoryListener.java b/hraven-etl/src/test/java/com/twitter/hraven/etl/AssertHistoryListener.java new file mode 100644 index 0000000..243e393 --- /dev/null +++ b/hraven-etl/src/test/java/com/twitter/hraven/etl/AssertHistoryListener.java @@ -0,0 +1,172 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.hraven.etl; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertNotNull; +import static junit.framework.Assert.fail; + +import java.io.IOException; +import java.text.ParseException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.JobHistoryCopy; + +import com.twitter.hraven.JobHistoryKeys; +import com.twitter.hraven.Counter; +import com.twitter.hraven.CounterMap; +import com.twitter.hraven.JobDetails; + +/** + * + */ +public class AssertHistoryListener implements JobHistoryCopy.Listener { + private Log log = LogFactory.getLog(getClass()); + + private JobDetails jobDetails; + private boolean assertedCounters = false; + private boolean assertedMapCounters = false; + private boolean assertedReduceCounters = false; + + private Set> keysToAssert = + new HashSet>(); + + // Store status so we can assert against final value at the end + private String status = null; + + public AssertHistoryListener(JobDetails jobDetails) { + this.jobDetails = jobDetails; + + addExpectedKey(JobHistoryKeys.TOTAL_MAPS, jobDetails.getTotalMaps()); + addExpectedKey(JobHistoryKeys.TOTAL_REDUCES, jobDetails.getTotalReduces()); + addExpectedKey(JobHistoryKeys.FAILED_MAPS, jobDetails.getFailedMaps()); + addExpectedKey(JobHistoryKeys.FAILED_REDUCES, jobDetails.getFailedReduces()); + addExpectedKey(JobHistoryKeys.FINISH_TIME, jobDetails.getFinishTime()); + addExpectedKey(JobHistoryKeys.FINISHED_MAPS, jobDetails.getFinishedMaps()); + addExpectedKey(JobHistoryKeys.FINISHED_REDUCES, jobDetails.getFinishedReduces()); + addExpectedKey(JobHistoryKeys.JOBID, jobDetails.getJobId()); + addExpectedKey(JobHistoryKeys.JOBNAME, jobDetails.getJobName()); + addExpectedKey(JobHistoryKeys.LAUNCH_TIME, jobDetails.getLaunchTime()); + addExpectedKey(JobHistoryKeys.JOB_PRIORITY, jobDetails.getPriority()); + addExpectedKey(JobHistoryKeys.SUBMIT_TIME, jobDetails.getSubmitTime()); + addExpectedKey(JobHistoryKeys.USER, jobDetails.getUser()); + } + + public Set getUnassertedKeys() { + Set keys = new HashSet(); + for (Pair keyPair : keysToAssert) { + keys.add(keyPair.getFirst()); + } + return keys; + } + + @Override + public void handle(JobHistoryCopy.RecordTypes recType, + Map values) throws IOException { + if (JobHistoryCopy.RecordTypes.Job != recType) { + log.warn(String.format("Not asserting record of type %s", recType)); + return; + } + String jobId = values.get(JobHistoryKeys.JOBID); + log.info("Asserting record values for job "+jobId); + + if (values.get(JobHistoryKeys.JOB_STATUS) != null) { + this.status = values.get(JobHistoryKeys.JOB_STATUS); + } + + // assert first-class stats + Iterator> iterator = keysToAssert.iterator(); + while(iterator.hasNext()) { + if(tryAssertKey(iterator.next(), values)) { iterator.remove(); } + } + + // assert counters + if (values.get(JobHistoryKeys.COUNTERS) != null) { + assertCounters(jobId, values.get(JobHistoryKeys.COUNTERS), jobDetails.getCounters()); + assertedCounters = true; + } + if (values.get(JobHistoryKeys.MAP_COUNTERS) != null) { + assertCounters(jobId, values.get(JobHistoryKeys.MAP_COUNTERS), jobDetails.getMapCounters()); + assertedMapCounters = true; + } + if (values.get(JobHistoryKeys.REDUCE_COUNTERS) != null) { + assertCounters(jobId, 
          values.get(JobHistoryKeys.REDUCE_COUNTERS), jobDetails.getReduceCounters());
+      assertedReduceCounters = true;
+    }
+  }
+
+  public boolean assertedAllCounters() {
+    return assertedCounters && assertedMapCounters && assertedReduceCounters;
+  }
+
+  public String getFinalStatus() {
+    return this.status;
+  }
+
+  private void addExpectedKey(JobHistoryKeys key, Object foundValue) {
+    String foundValueString = foundValue != null ? foundValue.toString() : null;
+    this.keysToAssert.add(new Pair<JobHistoryKeys, String>(key, foundValueString));
+  }
+
+  private boolean tryAssertKey(Pair<JobHistoryKeys, String> keyValuePair,
+      Map<JobHistoryKeys, String> values) {
+    JobHistoryKeys key = keyValuePair.getFirst();
+    String foundValue = keyValuePair.getSecond();
+
+    if (values.get(key) != null) {
+      assertEquals(String.format("Unexpected value found for key %s", key.name()),
+        values.get(key), foundValue);
+      return true;
+    }
+    return false;
+  }
+
+  private void assertCounters(String jobId, String expectedEncodedCounters, CounterMap foundCounterMap) {
+    assertNotNull(foundCounterMap);
+
+    Counters expCounters = null;
+    try {
+      expCounters = Counters.fromEscapedCompactString(expectedEncodedCounters);
+    } catch (ParseException e) {
+      fail("Exception trying to parse counters: " + e.getMessage());
+    }
+
+    for (Counters.Group group : expCounters) {
+      String expGroupName = group.getName();
+      for (Counters.Counter counter : group) {
+        String expName = counter.getName();
+        long expValue = counter.getValue();
+
+        Counter foundCounter = foundCounterMap.getCounter(expGroupName, expName);
+        assertNotNull(String.format(
+          "Counter not found for job=%s, group=%s, name=%s", jobId, expGroupName, expName), foundCounter);
+        assertEquals("Unexpected counter group",
+          expGroupName, foundCounter.getGroup());
+        assertEquals(String.format("Unexpected counter name for job=%s, group=%s", jobId, expGroupName),
+          expName, foundCounter.getKey());
+        assertEquals(String.format("Unexpected counter value for job=%s, group=%s, name=%s", jobId, expGroupName, expName),
+          expValue, foundCounter.getValue());
+      }
+    }
+  }
+}
diff --git a/hraven-etl/src/test/java/com/twitter/hraven/etl/TestFileStatusModificationTimeComparator.java b/hraven-etl/src/test/java/com/twitter/hraven/etl/TestFileStatusModificationTimeComparator.java
new file mode 100644
index 0000000..78f98b6
--- /dev/null
+++ b/hraven-etl/src/test/java/com/twitter/hraven/etl/TestFileStatusModificationTimeComparator.java
@@ -0,0 +1,66 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package com.twitter.hraven.etl;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.junit.Test;
+
+import com.twitter.hraven.etl.FileStatusModificationComparator;
+
+/**
+ * Test the FileStatusModificationComparator
+ */
+public class TestFileStatusModificationTimeComparator {
+
+  private static final FileStatus fileStatus1 = new FileStatus(0, false, 0, 0,
+      13, null);
+  private static final FileStatus fileStatus2 = new FileStatus(0, false, 0, 0,
+      17, null);
+
+  /**
+   * Test that the comparator orders FileStatus objects by modification time,
+   * treating null as the smallest value.
+ */ + @Test + public void testCompare() { + + FileStatusModificationComparator fsModComp = new FileStatusModificationComparator(); + + assertEquals(0, fsModComp.compare(fileStatus1, fileStatus1)); + assertEquals(0, fsModComp.compare(fileStatus2, fileStatus2)); + assertEquals(0, fsModComp.compare(null, null)); + + // Smaller + assertEquals(-1, fsModComp.compare(null, fileStatus1)); + assertEquals(-1, fsModComp.compare(null, fileStatus2)); + assertEquals(-1, fsModComp.compare(fileStatus1, fileStatus2)); + + // Bigger + assertEquals(1, fsModComp.compare(fileStatus1, null)); + assertEquals(1, fsModComp.compare(fileStatus2, null)); + assertEquals(1, fsModComp.compare(fileStatus2, fileStatus1)); + + int x = 10; + int y = 3; + int q = x / y; + int r = x % y; + int b = (r > 0) ? (x / y) + 1 : (x / y); + System.out.println("x=" + x + " y=" + y + " q=" + q + " r=" + r + " b=" + b); + + } + +} diff --git a/hraven-etl/src/test/java/com/twitter/hraven/etl/TestProcessRecord.java b/hraven-etl/src/test/java/com/twitter/hraven/etl/TestProcessRecord.java new file mode 100644 index 0000000..6bd9e4b --- /dev/null +++ b/hraven-etl/src/test/java/com/twitter/hraven/etl/TestProcessRecord.java @@ -0,0 +1,81 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package com.twitter.hraven.etl; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +import org.junit.Test; + +import com.twitter.hraven.etl.ProcessRecord; +import com.twitter.hraven.etl.ProcessState; + +/** + * Test {@link ProcessRecord}, and specifically the key construction and + * deconstruction.s + * + */ +public class TestProcessRecord { + + private static final String CLUSTER = "cluster@identifier"; + private static final ProcessState PROCESS_STATE = ProcessState.CREATED; + private static final long MIN_MODIFICATION_TIME_MILLIS = 1336115621494L; + private static final long MAX_MODIFICATION_TIME_MILLIS = 1336115732505L; + private static final int PROCESSED_JOB_FILES = 7; + private static final String PROCESSING_DIRECTORY = "/hadoop/mapred/history/processing/20120503061229"; + + @Test + public void testConstructors() { + ProcessRecord processRecord = new ProcessRecord(CLUSTER, + MIN_MODIFICATION_TIME_MILLIS, MAX_MODIFICATION_TIME_MILLIS, + PROCESSED_JOB_FILES, PROCESSING_DIRECTORY); + ProcessRecord processRecord2 = new ProcessRecord( + processRecord.getCluster(), PROCESS_STATE, + processRecord.getMinModificationTimeMillis(), + processRecord.getMaxModificationTimeMillis(), + processRecord.getProcessedJobFiles(), + processRecord.getProcessFile(), null, null); + + assertEquals(processRecord.getKey(), processRecord2.getKey()); + assertEquals(processRecord.getCluster(), processRecord2.getCluster()); + assertEquals(processRecord.getMaxModificationTimeMillis(), + processRecord2.getMaxModificationTimeMillis()); + assertEquals(processRecord.getMinModificationTimeMillis(), + processRecord2.getMinModificationTimeMillis()); + assertEquals(processRecord.getProcessedJobFiles(), + 
processRecord2.getProcessedJobFiles()); + assertEquals(processRecord.getProcessFile(), + processRecord2.getProcessFile()); + assertEquals(processRecord.getMinJobId(), + processRecord2.getMinJobId()); + assertEquals(processRecord.getMaxJobId(), + processRecord2.getMaxJobId()); + + + assertEquals(CLUSTER, processRecord2.getCluster()); + assertEquals(MAX_MODIFICATION_TIME_MILLIS, + processRecord2.getMaxModificationTimeMillis()); + assertEquals(MIN_MODIFICATION_TIME_MILLIS, + processRecord2.getMinModificationTimeMillis()); + assertEquals(PROCESSED_JOB_FILES, processRecord2.getProcessedJobFiles()); + assertEquals(PROCESSING_DIRECTORY, processRecord2.getProcessFile()); + assertNull(processRecord2.getMinJobId()); + + // TODO: Add a minJobId and maxJobId value test + + } + +} diff --git a/hraven-etl/src/test/resources/log4j.properties b/hraven-etl/src/test/resources/log4j.properties new file mode 100644 index 0000000..96df477 --- /dev/null +++ b/hraven-etl/src/test/resources/log4j.properties @@ -0,0 +1,20 @@ +log4j.rootCategory=INFO,console + +# +# console +# Add "console" to rootlogger above if you want to use this +# +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d %-5p [%t] %C{2}(%L): %m%n + +# Custom Logging levels + +#log4j.logger.org.apache.hadoop.fs.FSNamesystem=DEBUG + +log4j.logger.org.apache.hadoop=WARN +log4j.logger.org.apache.zookeeper=ERROR +log4j.logger.org.apache.hadoop.hbase=INFO + +log4j.logger.com.twitter.hraven=DEBUG diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..dd2cfbb --- /dev/null +++ b/pom.xml @@ -0,0 +1,228 @@ + + + + 4.0.0 + com.twitter.hraven + hraven + 0.9.0-SNAPSHOT + hRaven Project + + hRaven collects run time data and statistics from map reduce jobs running on Hadoop clusters + and stores the collected job history in an easily queryable format. For the jobs that are run + through frameworks (Pig or Scalding/Cascading) that decompose a script or application into + a DAG of map reduce jobs for actual execution, hRaven groups job history data together by an + application construct. This will allow for easier visualization of all of the component + jobs' execution for an application and more comprehensive trending and analysis over time. 
+ + pom + + + hraven-core + hraven-etl + hraven-assembly + + + + Twitter + https://twitter.com + + + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + scm:git:https://github.com/twitter/hraven.git + scm:git:https://github.com/twitter/hraven.git + https://github.com/twitter/hraven + + + + + Gary Helmling + ghelmling@twitter.com + + + Chris Trezzo + ctrezzo@twitter.com + + + Joep Rottinghuis + joep@twitter.com + + + Vrushali Channapattan + vchannapattan@twitter.com + + + + + + sonatype-nexus-snapshots + Sonatype OSS + https://oss.sonatype.org/content/repositories/snapshots + + + sonatype-nexus-staging + Nexus Release Repository + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + + + sonatype-nexus-snapshots + https://oss.sonatype.org/content/repositories/snapshots + + false + + + true + + + + + + Github + https://github.com/twitter/hRaven/issues + + + + + hRaven Users + hraven-user@googlegroups.com + https://groups.google.com/d/forum/hraven-user + + + hRaven Development + hraven-dev@googlegroups.com + https://groups.google.com/d/forum/hraven-dev + + + + + 1.6 + UTF-8 + 0.94.3 + 1.0.4 + + 1.1.1 + 4.8.1 + 1.8.5 + 1.2.15 + 12.0 + 1.12 + 1.9.6 + + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.1 + + forked-path + false + -Psonatype-oss-release + + + + + + + maven-deploy-plugin + 2.7 + + false + + + + org.apache.rat + apache-rat-plugin + 0.9 + + + + + + maven-assembly-plugin + 2.3 + + true + false + + + + + + + sonatype-oss-release + + + + org.apache.maven.plugins + maven-source-plugin + 2.1.2 + + + attach-sources + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.7 + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.1 + + + sign-artifacts + verify + + sign + + + + + + + + + + diff --git a/todo b/todo new file mode 100644 index 0000000..7466add --- /dev/null +++ b/todo @@ -0,0 +1,7 @@ +Rename *Test to Test* so that maven runs the tests +Do not use sysout but use log instead (should use log4J instead?) + + +OpenSource issues to resolve: +Headers on all files +Strip cluster names out (how to do that effectively for ruby scripts ?!?) Leave cluster name, but rename smf1 to dc1, so exp@dc1, dw@dc1 \ No newline at end of file
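
As a rough illustration of how the JobHistoryCopy.Listener callback above is meant to be used, here is a minimal sketch of a listener that only tallies record types. It is not part of the patch: the class name is made up for the example, and the parseHistoryFromFS(String, Listener, FileSystem) call is assumed to be carried over unchanged from the upstream Hadoop JobHistory class that JobHistoryCopy is copied from.

import java.io.IOException;
import java.util.EnumMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobHistoryCopy;
import org.apache.hadoop.mapred.JobHistoryCopy.RecordTypes;

import com.twitter.hraven.JobHistoryKeys;

/** Hypothetical example listener: counts how many records of each type appear. */
public class RecordTypeCountingListener implements JobHistoryCopy.Listener {

  private final Map<RecordTypes, Integer> counts =
      new EnumMap<RecordTypes, Integer>(RecordTypes.class);

  // Same handle() signature as AssertHistoryListener above: one callback per
  // parsed record, with the record's key/value pairs.
  @Override
  public void handle(RecordTypes recType, Map<JobHistoryKeys, String> values)
      throws IOException {
    Integer current = counts.get(recType);
    counts.put(recType, current == null ? 1 : current + 1);
  }

  public Map<RecordTypes, Integer> getCounts() {
    return counts;
  }

  public static void main(String[] args) throws IOException {
    // args[0]: path to a job history file on the default file system.
    // parseHistoryFromFS is assumed to exist on JobHistoryCopy, mirroring
    // org.apache.hadoop.mapred.JobHistory in Hadoop 1.x.
    FileSystem fs = FileSystem.get(new Configuration());
    RecordTypeCountingListener listener = new RecordTypeCountingListener();
    JobHistoryCopy.parseHistoryFromFS(args[0], listener, fs);
    System.out.println(listener.getCounts());
  }
}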