mrjob.conf.example
# This is basically the config file we use in production at Yelp, with some
# strategic edits. ;)
#
# If you don't have the yaml module installed, you'll have to use JSON instead,
# which would look something like this:
#
# {"runners": {
#    "emr": {
#      "aws_access_key_id": "HADOOPHADOOPBOBADOOP",
#      "aws_region": "us-west-1",
#      "aws_secret_access_key": "MEMIMOMADOOPBANANAFANAFOFADOOPHADOOP",
#      "base_tmp_dir": "/scratch/$USER",
#      "bootstrap_python_packages": [
#        "$BT/aws/python-packages/*.tar.gz"
#      ],
#      ...
#
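# (How this file gets found is version-dependent; as we understand it, mrjob
# checks the -c/--conf-path switch and the MRJOB_CONF environment variable
# first, then falls back to ~/.mrjob.conf and /etc/mrjob.conf. For example,
# with mr_word_count.py standing in for your own job script:
#
#   MRJOB_CONF=/path/to/mrjob.conf python mr_word_count.py -r emr input.txt
# )
#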
runners:
  emr:
    aws_access_key_id: HADOOPHADOOPBOBADOOP
    # We run in the west region because we're located on the west coast,
    # and there are no eventual consistency issues with newly created S3 keys.
    aws_region: us-west-1
    aws_secret_access_key: MEMIMOMADOOPBANANAFANAFOFADOOPHADOOP
    # alternate tmp dir
    base_tmp_dir: /scratch/$USER
    # $BT is the path to our source tree. This lets us add modules to
    # install on EMR by simply dumping them in this dir.
    bootstrap_python_packages:
      - $BT/aws/python-packages/*.tar.gz
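    # (mrjob expands environment variables in path options it reads from the
    # config, at least in the versions this file targets, so a run might look
    # like this -- the checkout path is our convention, not mrjob's:
    #
    #   BT=$HOME/src/yelp python mr_your_job.py -r emr input.txt
    # )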
    # specifying an ssh key pair allows us to ssh tunnel to the job tracker
    # and fetch logs via ssh
    ec2_key_pair: EMR
    ec2_key_pair_file: $BT/config/EMR.pem
    # use beefier instances in production
    ec2_instance_type: c1.xlarge
    # but only use one unless overridden
    num_ec2_instances: 1
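    # (overriding per-run happens on the command line; to the best of our
    # recollection the switch in mrjob versions of this era was
    # --num-ec2-instances, e.g.:
    #
    #   python mr_your_job.py -r emr --num-ec2-instances 10 input.txt
    # )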
    # use our local time zone (this is important for deciding when
    # days start and end, for instance)
    cmdenv:
      TZ: America/Los_Angeles
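    # (a quick sketch of why this matters inside a task: on Linux the C
    # library honors TZ, so Python's time module localizes accordingly --
    # the variable names here are made up:
    #
    #   import time
    #   # with TZ=America/Los_Angeles, localtime() yields Pacific time
    #   day = time.strftime('%Y-%m-%d', time.localtime(timestamp))
    # )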
    # we create the src-tree.tar.gz tarball with a Makefile. It only contains
    # a subset of our code
    python_archives: &python_archives
      - $BT/aws/src-tree.tar.gz
    # our bucket also lives in the us-west region
    s3_log_uri: s3://walrus/tmp/logs/
    s3_scratch_uri: s3://walrus/tmp/
    setup_cmds: &setup_cmds
      # these files are different between dev and production, so they're
      # uploaded separately. copying them into place isn't safe because
      # src-tree.tar.gz is actually shared between several mappers/reducers.
      # Another safe approach would be to add a rule to Makefile.emr that
      # copies these files if they haven't already been copied (setup_cmds
      # from two mappers/reducers won't run simultaneously on the same
      # machine); a sketch of such a rule follows these commands.
      - ln -sf $(readlink -f config.py) src-tree.tar.gz/config/config.py
      - ln -sf $(readlink -f secret.py) src-tree.tar.gz/config/secret.py
      # run Makefile.emr to compile C code (EMR has a different architecture,
      # so we can't just upload the .so files)
      - cd src-tree.tar.gz; make -f Makefile.emr
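    # (a minimal sketch of that alternative Makefile.emr rule; the target and
    # source paths here are hypothetical, adjust to your layout:
    #
    #   config/config.py: ../config.py
    #           cp $< $@
    #
    # make only re-runs the rule when the target is missing or stale, which
    # gives the "copy only if not already copied" behavior described above)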
    # generally, we run jobs on a Linux server separate from our desktop
    # machine. So the SSH tunnel needs to be open so a browser on our
    # desktop machine can connect to it.
    ssh_tunnel_is_open: true
    ssh_tunnel_to_job_tracker: true
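    # (both options also have command-line switches; if memory serves, in
    # mrjob versions of this era they were --ssh-tunnel-to-job-tracker and
    # --ssh-tunnel-is-open -- verify against your version's --help output)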
    # upload these particular files on the fly because they're different
    # between development and production
    upload_files: &upload_files
      - $BT/config/config.py
      - $BT/config/secret.py
  hadoop:
    # Note the use of YAML references to re-use parts of the EMR config.
    # We don't currently run our own hadoop cluster, so this section is
    # pretty boring.
    base_tmp_dir: /scratch/$USER
    python_archives: *python_archives
    setup_cmds: *setup_cmds
    upload_files: *upload_files
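    # (each *name alias expands to the node tagged &name under emr above;
    # e.g. `python_archives: *python_archives` is equivalent to writing out:
    #
    #   python_archives:
    #     - $BT/aws/src-tree.tar.gz
    # )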
  local:
    # We don't have gcc installed in production, so if we have to run an
    # MRJob in local mode in production, don't run the Makefile
    # and whatnot; just fall back on the original copy of the code.
    base_tmp_dir: /scratch/$USER
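    # (selecting this runner is just a matter of passing -r local; the job
    # script name below is a placeholder:
    #
    #   python mr_your_job.py -r local input.txt
    # )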