forked from icio/mrjob
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Revert "removed out-of-date example config file (we now have examples…
… in docs)" This reverts commit 0f72f77.
- Loading branch information
David Marin
committed
Oct 21, 2011
1 parent
c247231
commit 0646440
Showing
1 changed file
with
84 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
# This is basically the config file we use in production at Yelp, with some | ||
# strategic edits. ;) | ||
# | ||
# If you don't have the yaml module installed, you'll have to use JSON instead, | ||
# which would look something like this: | ||
# | ||
# {"runners": { | ||
# "emr": { | ||
# "aws_access_key_id": "HADOOPHADOOPBOBADOOP", | ||
# "aws_region": "us-west-1", | ||
# "aws_secret_access_key": "MEMIMOMADOOPBANANAFANAFOFADOOPHADOOP", | ||
# "base_tmp_dir": "/scratch/$USER", | ||
# "bootstrap_python_packages": [ | ||
# "$BT/aws/python-packages/*.tar.gz" | ||
# ], | ||
# ... | ||
# | ||
runners: | ||
emr: | ||
aws_access_key_id: HADOOPHADOOPBOBADOOP | ||
# We run in the west region because we're located on the west coast, | ||
# and there are no eventual consistency issues with newly created S3 keys. | ||
aws_region: us-west-1 | ||
aws_secret_access_key: MEMIMOMADOOPBANANAFANAFOFADOOPHADOOP | ||
# alternate tmp dir | ||
base_tmp_dir: /scratch/$USER | ||
# $BT is the path to our source tree. This lets us add modules to | ||
# install on EMR by simply dumping them in this dir. | ||
bootstrap_python_packages: | ||
- $BT/aws/python-packages/*.tar.gz | ||
# specifying an ssh key pair allows us to ssh tunnel to the job tracker | ||
# and fetch logs via ssh | ||
ec2_key_pair: EMR | ||
ec2_key_pair_file: $BT/config/EMR.pem | ||
# use beefier instances in production | ||
ec2_instance_type: c1.xlarge | ||
# but only use one unless overridden | ||
num_ec2_instances: 1 | ||
# use our local time zone (this is important for deciding when | ||
# days start and end, for instance) | ||
cmdenv: | ||
TZ: America/Los_Angeles | ||
# we create the src-tree.tar.gz tarball with a Makefile. It only contains | ||
# a subset of our code | ||
python_archives: &python_archives | ||
- $BT/aws/src-tree.tar.gz | ||
# our bucket also lives in the us-west region | ||
s3_log_uri: s3://walrus/tmp/logs/ | ||
s3_scratch_uri: s3://walrus/tmp/ | ||
setup_cmds: &setup_cmds | ||
# these files are different between dev and production, so they're | ||
# uploaded separately. Copying them into place isn't safe because | ||
# src-tree.tar.gz is actually shared between several mappers/reducers. | ||
# Another safe approach would be to add a rule to Makefile.emr that | ||
# copies these files if they haven't already been copied (setup_cmds | ||
# from two mappers/reducers won't run simultaneously on the same machine) | ||
- ln -sf $(readlink -f config.py) src-tree.tar.gz/config/config.py | ||
- ln -sf $(readlink -f secret.py) src-tree.tar.gz/config/secret.py | ||
# run Makefile.emr to compile C code (EMR has a different architecture, | ||
# so we can't just upload the .so files) | ||
- cd src-tree.tar.gz; make -f Makefile.emr | ||
# generally, we run jobs on a Linux server separate from our desktop | ||
# machine. So the SSH tunnel needs to be open so a browser on our | ||
# desktop machine can connect to it. | ||
ssh_tunnel_is_open: true | ||
ssh_tunnel_to_job_tracker: true | ||
# upload these particular files on the fly because they're different | ||
# between development and production | ||
upload_files: &upload_files | ||
- $BT/config/config.py | ||
- $BT/config/secret.py | ||
hadoop: | ||
# Note the use of YAML references to re-use parts of the EMR config. | ||
# We don't currently run our own hadoop cluster, so this section is | ||
# pretty boring. | ||
base_tmp_dir: /scratch/$USER | ||
python_archives: *python_archives | ||
setup_cmds: *setup_cmds | ||
upload_files: *upload_files | ||
local: | ||
# We don't have gcc installed in production, so if we have to run an | ||
# MRJob in local mode in production, don't run the Makefile | ||
# and whatnot; just fall back on the original copy of the code. | ||
base_tmp_dir: /scratch/$USER |