#!/usr/bin/env bash
#
# Copyright 2011 Twitter, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This script is used to deploy a PyCascading job remotely to a server
# where Hadoop is installed. The variables below are the defaults.
#
# This is the default server where the PyCascading script will be submitted
# to Hadoop. We assume we have SSH access to this server.
server=localhost
# This is the folder on the remote server where a temporary directory is
# going to be created for the submission. $HOME is only expanded on the
# remote server.
server_deploys_dir='$HOME/pycascading/deploys'
# The folder on the remote server where the PyCascading master jar will be
# placed. This must be given as an absolute path name so that the master
# files can be found from any directory.
server_build_dir='$HOME/pycascading/master'
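# (The single quotes in the two assignments above keep $HOME from being
# expanded locally; the literal string is written into the generated setup.sh
# and run.sh below, and the remote shell expands $HOME there.)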
# Additional SSH options (see "man ssh"; private key, etc.)
ssh_options=""
# Additional Hadoop options to be put in the run.sh runner
hadoop_options=""
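
# Example (hypothetical host and file names): deploy the master jar, then
# submit a job consisting of main.py plus a helper module and run it at once:
#   ./remote_deploy.sh -m -s hadoop-gw.example.com -o '-i ~/.ssh/hadoop_key' \
#       -r main.py lib/util.py
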
# Options over, the script begins here
usage()
{
    cat <<EOF
Usage: $(basename "$0") [options] <main_script> [additional_files]

The main_script gets executed by PyCascading. All additional_files are also
copied to the remote server and submitted together with the job to Hadoop.

Options:
    -h               Show this message.
    -m               Also deploy the PyCascading master jar before submitting
                     the job.
    -f <file>        Copy file to the server together with main_script, but
                     do not bundle it into the Hadoop jar for submission. This
                     option may be repeated several times for multiple files.
                     File names cannot start with a dot.
    -s <server>      The name of the remote server where Hadoop is installed
                     and where the PyCascading jar should be deployed.
    -o <ssh_options> Additional options for SSH (such as a private key).
                     ssh_options is one string enclosed by "s or 's, even if
                     it contains several parameters.
    -O <hadoop_opts> Additional Hadoop options to be put in the running script.
    -r               Run the job immediately after submission with SSH. The
                     recommended way to run a script is with screen or nohup,
                     so that the job doesn't get interrupted if the terminal
                     connection goes down. Note that in this case no additional
                     command line parameters can be passed to the job.
EOF
}

# Returns the absolute path for the parameter. We cannot use either realpath
# or readlink, as these may not be installed on MacOS.
# Thanks to Simon Radford.
realpath()
{
    if echo "$1" | grep '^/' >/dev/null; then
        # Path is absolute
        echo "$1"
    else
        # Path is relative to the working directory
        echo "$(pwd)/$1"
    fi
}
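# For example, if the working directory is /home/user, "realpath jobs/main.py"
# prints /home/user/jobs/main.py, and "realpath /tmp/x" prints /tmp/x unchanged.
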
# Remove the leading slashes from a path. This is needed when we package the
# Python sources: tar strips leading slashes when archiving, so after
# extraction the paths have none, and the path we pass to the job must match.
remove_leading_slash()
{
    echo "$1" | sed 's/^\/*//'
}
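# For example, "remove_leading_slash /home/user/main.py" prints
# home/user/main.py, matching the member name tar stores for that file.
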
# Copy the master jar over first? The -m option.
master_first=no
# Run job after submission with SSH?
run_immediately='dont_run'
declare -a files_to_copy
while getopts ":hmf:s:o:O:r" OPTION; do
    case $OPTION in
        h)  usage
            exit 1
            ;;
        m)  master_first=yes
            ;;
        f)  files_to_copy+=("$OPTARG")
            ;;
        s)  server="$OPTARG"
            ;;
        o)  ssh_options="$OPTARG"
            ;;
        O)  hadoop_options="$OPTARG"
            ;;
        r)  run_immediately='do_run'
            ;;
    esac
done
shift $((OPTIND-1))
main_file="$1"
if [ "$main_file" == "" -a $master_first == no ]; then
usage
exit 3
fi
home_dir=$(realpath "$(dirname "$0")")
# This mktemp invocation works on both Linux and MacOS
tmp_dir=$(mktemp -d -t PyCascading-tmp-XXXXXX)
if [ "$master_first" == yes ]; then
    build_dir="$home_dir/build"
    if [ -e "$build_dir/pycascading.jar" ] && \
       [ -e "$build_dir/pycascading.tgz" ]; then
        ln -s "$build_dir/pycascading.jar" "$build_dir/pycascading.tgz" \
            "$home_dir/python/pycascading/bootstrap.py" "$tmp_dir"
    else
        echo 'Build the PyCascading master package first in the "java" folder with ant.'
        exit 2
    fi
fi
if [ "$main_file" != "" ]; then
tar -c -z -f "$tmp_dir/sources.tgz" "$@"
if [ ${#files_to_copy} -gt 0 ]; then
tar -c -z -f "$tmp_dir/others.tgz" "${files_to_copy[@]}"
fi
fi
#
# Create a setup file that will be run on the deploy server after everything
# is copied over.
#
cat >"$tmp_dir/setup.sh" <<EOF
#
# This script is run on the deploy server to set up the PyCascading job folder
#
if [ -e pycascading.jar ]; then
# If we packaged the master jar, update it
mkdir -p "$server_build_dir"
mv pycascading.jar pycascading.tgz bootstrap.py "$server_build_dir"
rm -rf "$server_build_dir/python"
tar -x -z -f "$server_build_dir/pycascading.tgz" -C "$server_build_dir"
fi
if [ -e sources.tgz ]; then
mkdir -p "$server_deploys_dir"
deploy_dir=\$(mktemp -d "$server_deploys_dir/XXXXXX")
mkdir "\$deploy_dir/job"
mv run.sh "\$deploy_dir"
tar -x -z -f sources.tgz -C "\$deploy_dir/job"
mv sources.tgz "\$deploy_dir"
if [ -e others.tgz ]; then
tar -x -z -f others.tgz -C "\$deploy_dir/job"
fi
if [ ! -e "$server_build_dir/pycascading.jar" ]; then
echo 'WARNING!!!'
echo 'The PyCascading master jar has not yet been deployed, do a "remote_deploy.sh -m" first.'
echo
fi
echo "Run the job on $server with:"
echo " \$deploy_dir/run.sh [parameters]"
fi
if [ \$1 == 'do_run' ]; then
\$deploy_dir/run.sh "\$@"
fi
EOF
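# With the default server_deploys_dir, a successful setup leaves a layout like
# this on the server (XXXXXX is the random suffix chosen by mktemp there):
#   $HOME/pycascading/deploys/XXXXXX/run.sh       the runner generated below
#   $HOME/pycascading/deploys/XXXXXX/sources.tgz  the packaged job sources
#   $HOME/pycascading/deploys/XXXXXX/job/         sources + extra files, extracted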
chmod +x "$tmp_dir/setup.sh"
#
# Create a small script on the remote server that runs the job
#
main_file=$(remove_leading_slash "$main_file")
cat >"$tmp_dir/run.sh" <<EOF
# Run the PyCascading job
cd "\$(dirname "\$0")/job"
hadoop $hadoop_options jar "$server_build_dir/pycascading.jar" \\
"$server_build_dir/bootstrap.py" hadoop "$server_build_dir" \\
-a "$server_build_dir/pycascading.tgz" -a ../sources.tgz \\
"$main_file" "\$@"
EOF
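# With the default settings and a hypothetical main script main.py, the command
# written into run.sh expands to roughly:
#   hadoop jar "$HOME/pycascading/master/pycascading.jar" \
#       "$HOME/pycascading/master/bootstrap.py" hadoop "$HOME/pycascading/master" \
#       -a "$HOME/pycascading/master/pycascading.tgz" -a ../sources.tgz main.py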
chmod +x "$tmp_dir/run.sh"
# Upload the package to the server and run the setup script: stream the
# temporary directory (following the symlinks with tar -h) into a fresh
# mktemp directory on the server, run setup.sh there, then clean up.
# ($ssh_options is intentionally left unquoted so that multiple SSH options
# split into separate words; SSH options must precede the host name.)
cd "$tmp_dir"
tar -c -z -h -f - . | ssh $ssh_options "$server" \
    "dir=\$(mktemp -d -t PyCascading-tmp-XXXXXX); cd \"\$dir\"; tar -x -z -f -; " \
    "./setup.sh $run_immediately \"\$@\"; rm -r \"\$dir\""
rm -r "$tmp_dir"