Revert "removed out-of-date example config file (we now have examples…

… in docs)"

This reverts commit 0f72f77.
commit 0646440db4498d5de5d268f32a5726dcfb76c74f (1 parent: c247231)
David Marin authored
Showing with 84 additions and 0 deletions.
  1. +84 −0 mrjob.conf.example

mrjob.conf.example
@@ -0,0 +1,84 @@
+# This is basically the config file we use in production at Yelp, with some
+# strategic edits. ;)
+#
+# If you don't have the yaml module installed, you'll have to use JSON instead,
+# which would look something like this:
+#
+# {"runners": {
+#    "emr": {
+#      "aws_access_key_id": "HADOOPHADOOPBOBADOOP",
+#      "aws_region": "us-west-1",
+#      "aws_secret_access_key": "MEMIMOMADOOPBANANAFANAFOFADOOPHADOOP",
+#      "base_tmp_dir": "/scratch/$USER",
+#      "bootstrap_python_packages": [
+#        "$BT/aws/python-packages/*.tar.gz"
+#      ],
+# ...
+#
+runners:
+  emr:
+    aws_access_key_id: HADOOPHADOOPBOBADOOP
+    # We run in the west region because we're located on the west coast,
+    # and there are no eventual consistency issues with newly created S3 keys.
+    aws_region: us-west-1
+    aws_secret_access_key: MEMIMOMADOOPBANANAFANAFOFADOOPHADOOP
+    # alternate tmp dir
+    base_tmp_dir: /scratch/$USER
+    # $BT is the path to our source tree. This lets us add modules to
+    # install on EMR by simply dumping them in this dir.
+    bootstrap_python_packages:
+    - $BT/aws/python-packages/*.tar.gz
+    # specifying an ssh key pair allows us to ssh tunnel to the job tracker
+    # and fetch logs via ssh
+    ec2_key_pair: EMR
+    ec2_key_pair_file: $BT/config/EMR.pem
+    # use beefier instances in production
+    ec2_instance_type: c1.xlarge
+    # but only use one unless overridden
+    num_ec2_instances: 1
+    # use our local time zone (this is important for deciding when
+    # days start and end, for instance)
+    cmdenv:
+      TZ: America/Los_Angeles
+    # we create the src-tree.tar.gz tarball with a Makefile. It only contains
+    # a subset of our code
+    python_archives: &python_archives
+    - $BT/aws/src-tree.tar.gz
+    # our bucket also lives in the us-west region
+    s3_log_uri: s3://walrus/tmp/logs/
+    s3_scratch_uri: s3://walrus/tmp/
+    setup_cmds: &setup_cmds
+    # these files are different between dev and production, so they're
+    # uploaded separately. Copying them into place isn't safe because
+    # src-tree.tar.gz is actually shared between several mappers/reducers.
+    # Another safe approach would be to add a rule to Makefile.emr that
+    # copies these files if they haven't already been copied (setup_cmds
+    # from two mappers/reducers won't run simultaneously on the same machine)
+    - ln -sf $(readlink -f config.py) src-tree.tar.gz/config/config.py
+    - ln -sf $(readlink -f secret.py) src-tree.tar.gz/config/secret.py
+    # run Makefile.emr to compile C code (EMR has a different architecture,
+    # so we can't just upload the .so files)
+    - cd src-tree.tar.gz; make -f Makefile.emr
+    # generally, we run jobs on a Linux server separate from our desktop
+    # machine. So the SSH tunnel needs to be open so a browser on our
+    # desktop machine can connect to it.
+    ssh_tunnel_is_open: true
+    ssh_tunnel_to_job_tracker: true
+    # upload these particular files on the fly because they're different
+    # between development and production
+    upload_files: &upload_files
+    - $BT/config/config.py
+    - $BT/config/secret.py
+  hadoop:
+    # Note the use of YAML references to re-use parts of the EMR config.
+    # We don't currently run our own hadoop cluster, so this section is
+    # pretty boring.
+    base_tmp_dir: /scratch/$USER
+    python_archives: *python_archives
+    setup_cmds: *setup_cmds
+    upload_files: *upload_files
+  local:
+    # We don't have gcc installed in production, so if we have to run an
+    # MRJob in local mode in production, don't run the Makefile
+    # and whatnot; just fall back on the original copy of the code.
+    base_tmp_dir: /scratch/$USER
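
For context, a config like this is what mrjob reads when a job is launched with one of the runners named above. Below is a minimal sketch of a job that would pick up the emr section; the job class, file names, and input path are illustrative assumptions, not part of this commit:

    # mr_word_count.py -- illustrative job, not part of this commit.
    # When run with -r emr, mrjob applies the settings from the emr section
    # of the config file (located via --conf-path or its default search paths).
    from mrjob.job import MRJob


    class MRWordCount(MRJob):

        def mapper(self, _, line):
            # emit a count of 1 for each whitespace-separated word
            for word in line.split():
                yield word, 1

        def reducer(self, word, counts):
            # sum the counts emitted for each word by the mappers
            yield word, sum(counts)


    if __name__ == '__main__':
        MRWordCount.run()

An invocation would look something like
    python mr_word_count.py -r emr --conf-path mrjob.conf.example input.txt
(the input file name is made up; without --conf-path, mrjob falls back to its default config locations).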