This repository has been archived by the owner on Jan 15, 2022. It is now read-only.

Commit

Update etl scripts
Contains environment variables for the ETL scripts. Factorizes some code and creates
symbolic links for hraven-core.jar and hraven-etl.jar if needed. The ETL scripts now source this file.

Source hraven-etl-env.sh and add default values for costFile and rawFileSizeLimit.
Also calls jobFilePreprocessor.sh and jobFileProcessor.sh with the right arity and parameters from now on.

Source hraven-etl-env.sh.

Source hraven-etl-env.sh and add defaultRawFileSizeLimit to the parameters.

Export HBASE_CLASSPATH.
y.bismuth committed May 19, 2014
1 parent d1b12f8 commit 6b76f95
Showing 6 changed files with 70 additions and 41 deletions.
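
For context, the pattern introduced by this commit is that each ETL script drops its own setup boilerplate and instead sources the shared environment file added below. A minimal sketch of the top of such a script (the exact lines appear in the diffs that follow; the scripts source the file by relative name, so they are assumed to be run from bin/etl):

#!/bin/bash
# Shared ETL environment: sources conf/hraven-env.sh and pidfiles.sh,
# creates the hraven-core.jar / hraven-etl.jar symlinks if needed, and
# exports LIBJARS, HADOOP_CLASSPATH and hravenEtlJar.
source hraven-etl-env.sh
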
37 changes: 37 additions & 0 deletions bin/etl/hraven-etl-env.sh
@@ -0,0 +1,37 @@
#!/bin/bash
#
# Copyright 2013 Twitter, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Used to configure hraven-etl environment

home=$(dirname $0)
source $home/../../conf/hraven-env.sh
source $home/pidfiles.sh

#check if hraven-core.jar and hraven-etl.jar exist
#if not, create symbolic links to the needed jars
#we assume either they both exist, or none does
libhraven=`cd $(dirname $0)/../../lib;pwd;`
if [ ! -f $libhraven/hraven-core.jar ] || [ ! -f $libhraven/hraven-etl.jar ]
then
ln -s $libhraven/hraven-core-*.jar $libhraven/hraven-core.jar
ln -s $libhraven/hraven-etl-*.jar $libhraven/hraven-etl.jar
fi

# set the hraven-core jar as part of libjars and hadoop classpath
# set this here because it only pertains to the etl logic
export LIBJARS=$home/../../lib/hraven-core.jar
export HADOOP_CLASSPATH=$home/../../lib/*:$LIBJARS:$HBASE_CLASSPATH
hravenEtlJar=$home/../../lib/hraven-etl.jar
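
As a usage note for the symlink block above: once this script has run, lib/ should contain version-agnostic entry points next to the versioned jars, roughly as follows (the version number is illustrative, not taken from this commit):

lib/hraven-core-X.Y.Z.jar
lib/hraven-core.jar -> .../lib/hraven-core-X.Y.Z.jar
lib/hraven-etl-X.Y.Z.jar
lib/hraven-etl.jar -> .../lib/hraven-etl-X.Y.Z.jar

Note that ln -s $libhraven/hraven-core-*.jar assumes exactly one versioned hraven-core jar is present; with several matches, ln would receive multiple sources and fail because the target is not a directory.
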
36 changes: 18 additions & 18 deletions bin/etl/hraven-etl.sh
@@ -28,34 +28,34 @@

# Parameters
########## FILL IN APPROPRIATE VALUES BELOW ##########
cluster="mycluster"
cluster="mycluster" #Name of your cluster (arbitrary)
mapredmaxsplitsize="204800"
batchsize="100"
schedulerpoolname="mypool"
batchsize="100" #default is 1, which is bad for mapred job
schedulerpoolname="mypool" #name of scheduler pool (arbitrary)
threads="20"
defaultrawfilesizelimit="524288000"
machinetype="mymachine" #name of machine (arbitrary)
costfile=/var/lib/hraven/conf/costFile
#conf directories
hadoopconfdir=${HADOOP_CONF_DIR:-$HADOOP_HOME/conf}
hbaseconfdir=${HBASE_CONF_DIR:-$HBASE_HOME/conf}
# HDFS directories for processing and loading job history data
historyRawDir=/hadoop/mapred/history/done
historyProcessingDir=/hadoop/mapred/history/processing/
historyRawDir=/yarn/history/done/
historyProcessingDir=/hraven/processing/
#######################################################

home=$(dirname $0)

# set the hraven-core jar as part of libjars and hadoop classpath
# set this here because it only pertains to the etl logic
export LIBJARS=$home/../../lib/hraven-core.jar
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$LIBJARS
hravenEtlJar=$home/../../lib/hraven-etl.jar
#If costfile is empty, fill it with default values
if [[ -z `cat $costfile` ]]; then
echo "$machinetype.computecost=3.0" > $costfile
echo "$machinetype.machinememory=12000" >> $costfile
fi

source $home/../../conf/hraven-env.sh
source $home/pidfiles.sh
source hraven-etl-env.sh

# Each job has 2 files: history and config
batchsizejobs=$(($batchsize / 2))

myscriptname=$(basename "$0" .sh)
stopfile=$HRAVEN_PID_DIR/$myscriptname.stop

if [ -f $stopfile ]; then
echo "Error: not allowed to run. Remove $stopfile continue." 1>&2
exit 1
@@ -65,10 +65,10 @@ create_pidfile $HRAVEN_PID_DIR
trap 'cleanup_pidfile_and_exit $HRAVEN_PID_DIR' INT TERM EXIT

# Pre-process
$home/jobFilePreprocessor.sh $hadoopconfdir $historyRawDir $historyProcessingDir $cluster $batchsize
$home/jobFilePreprocessor.sh $hadoopconfdir $historyRawDir $historyProcessingDir $cluster $batchsize $defaultrawfilesizelimit

# Load
$home/jobFileLoader.sh $hadoopconfdir $mapredmaxsplitsize $schedulerpoolname $cluster $historyProcessingDir

# Process
$home/jobFileProcessor.sh $hbaseconfdir $schedulerpoolname $historyProcessingDir $cluster $threads $batchsize
$home/jobFileProcessor.sh $hbaseconfdir $schedulerpoolname $historyProcessingDir $cluster $threads $batchsize $machinetype $costfile
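
For a concrete example of the cost-file block above: with machinetype="mymachine" and an empty (or missing, assuming /var/lib/hraven/conf exists and is writable) cost file, the two echo lines would leave /var/lib/hraven/conf/costFile containing the defaults:

mymachine.computecost=3.0
mymachine.machinememory=12000

jobFileProcessor.sh then receives $machinetype and this file as its two new trailing arguments.
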
7 changes: 2 additions & 5 deletions bin/etl/jobFileLoader.sh
@@ -25,13 +25,10 @@ then
exit 1
fi

home=$(dirname $0)
source $home/../../conf/hraven-env.sh
source $home/pidfiles.sh
source hraven-etl-env.sh

myscriptname=$(basename "$0" .sh)
hravenEtlJar=$home/../../lib/hraven-etl.jar
stopfile=$HRAVEN_PID_DIR/$myscriptname.stop
LIBJARS=$home/../../lib/hraven-core.jar

if [ -f $stopfile ]; then
echo "Error: not allowed to run. Remove $stopfile continue." 1>&2
16 changes: 6 additions & 10 deletions bin/etl/jobFilePreprocessor.sh
@@ -19,21 +19,17 @@
# Usage ./jobFilePreprocessor.sh [hadoopconfdir]
# [historyrawdir] [historyprocessingdir] [cluster] [batchsize]

if [ $# -ne 5 ]
if [ $# -ne 6 ]
then
echo "Usage: `basename $0` [hadoopconfdir] [historyrawdir] [historyprocessingdir] [cluster] [batchsize]"
echo "Usage: `basename $0` [hadoopconfdir] [historyrawdir] [historyprocessingdir] [cluster] [batchsize] [defaultrawfilesizelimit]"
exit 1
fi

home=$(dirname $0)
source $home/../../conf/hraven-env.sh
source $home/pidfiles.sh
source hraven-etl-env.sh

export HADOOP_HEAPSIZE=4000
myscriptname=$(basename "$0" .sh)
stopfile=$HRAVEN_PID_DIR/$myscriptname.stop
hravenEtlJar=$home/../../lib/hraven-etl.jar
LIBJARS=$home/../../lib/hraven-core.jar
export HADOOP_HEAPSIZE=4000
export HADOOP_CLASSPATH=$(ls $home/../../lib/commons-lang-*.jar)

if [ -f $stopfile ]; then
echo "Error: not allowed to run. Remove $stopfile continue." 1>&2
@@ -43,4 +39,4 @@ fi
create_pidfile $HRAVEN_PID_DIR
trap 'cleanup_pidfile_and_exit $HRAVEN_PID_DIR' INT TERM EXIT

hadoop --config $1 jar $hravenEtlJar com.twitter.hraven.etl.JobFilePreprocessor -libjars=$LIBJARS -d -i $2 -o $3 -c $4 -b $5
hadoop --config $1 jar $hravenEtlJar com.twitter.hraven.etl.JobFilePreprocessor -libjars=$LIBJARS -d -i $2 -o $3 -c $4 -b $5 -s $6
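
As a usage example of the new six-argument form, an invocation using the illustrative defaults wired up in hraven-etl.sh above (example values, not requirements):

./jobFilePreprocessor.sh ${HADOOP_CONF_DIR:-$HADOOP_HOME/conf} /yarn/history/done/ /hraven/processing/ mycluster 100 524288000

The six positional arguments map to --config plus the -i, -o, -c, -b and -s options of the JobFilePreprocessor call shown above.
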
8 changes: 2 additions & 6 deletions bin/etl/jobFileProcessor.sh
@@ -27,14 +27,10 @@ then
exit 1
fi

home=$(dirname $0)
source $home/../../conf/hraven-env.sh
source $home/pidfiles.sh
source hraven-etl-env.sh

myscriptname=$(basename "$0" .sh)
hravenEtlJar=$home/../../lib/hraven-etl.jar
LIBJARS=$home/../../lib/hraven-core.jar
stopfile=$HRAVEN_PID_DIR/$myscriptname.stop
export HADOOP_CLASSPATH=$(ls $home/../../lib/commons-lang-*.jar)

if [ -f $stopfile ]; then
echo "Error: not allowed to run. Remove $stopfile continue." 1>&2
7 changes: 5 additions & 2 deletions conf/hraven-env.sh
@@ -27,12 +27,15 @@
# All other hadoop configurations can be set in the standard hadoop manner, or supplied here instead.
# export HADOOP_CONF_DIR=

# export HRAVEN_CLASSPATH=`hbase classpath`
# HBASE_CLASSPATH Used in hraven-etl-env.sh
export HBASE_CLASSPATH=`hbase classpath`

# export HRAVEN_CLASSPATH=$HBASE_CLASSPATH
# export HRAVEN_CLASSPATH=`hbase --config /etc/hbase/conf-hbase-tst-dc1 classpath`
export HRAVEN_CLASSPATH=

# The maximum amount of heap to use, in MB. Default is 1000.
# export HRAVEN_HEAPSIZE=1000

# Location for process ID files for any hRaven daemons
export HRAVEN_PID_DIR=/tmp/
export HRAVEN_PID_DIR=/tmp/
