Permalink
Browse files

misc progress

  • Loading branch information...
1 parent c7aa505 commit 680ef976fa49b0d675f3dc4986059eb3455b39f5 @tavisrudd committed Nov 17, 2010
Showing with 207 additions and 107 deletions.
  1. +1 −0 .hgignore
  2. +3 −0 pip-requirements.txt
  3. +57 −0 process_1.sh
  4. +4 −0 process_parallel.sh
  5. +2 −0 pub_keys
  6. +9 −20 riak.org
  7. +131 −87 wikistats.py
View
@@ -0,0 +1 @@
+riak.installed.tgz
@@ -0,0 +1,3 @@
+ipython
+nose
+fabric
View
@@ -0,0 +1,57 @@
#!/bin/bash
# Usage: process_1.sh PATTERN OUTFILE_PREFIX LOG_FILE
#
# Download one hourly wikistats pagecount log, filter it for lines
# matching PATTERN, and write the matches as a JSON-ish document under
# /mnt/working/results/.  ".done" marker files guard each stage so the
# script can be re-run safely after an interruption.
set -u
set -o pipefail

PATTERN=$1
OUTFILE_PREFIX=$2
LOG_FILE=$3   # e.g. pagecounts-20101017-120000.gz
              # (was F=$3, leaving $LOG_FILE unset everywhere below)

# Extract "YYYYMMDD-HH" from the log file name.
TIMESTAMP=$(printf '%s\n' "$LOG_FILE" \
    | sed -e 's/pagecounts-\([0-9]*\)-\([0-9][0-9]\)[0-9]*\.gz/\1-\2/')

OUTPUT_FILE=/mnt/working/results/${OUTFILE_PREFIX}-${TIMESTAMP}.out
INPUT_FILE=/mnt/tmpfs/$LOG_FILE

function get_input_file {
    # see https://github.com/datawrangling/trendingtopics
    # This version downloads the file from dammit.lt and caches it on a
    # local memory-only file system.
    #
    # An alternative approach is to mount the EBS volume snapshot
    # here: http://aws.amazon.com/datasets/4182 In practice the
    # snapshot restoration process (from s3) is extremely slow for
    # massive volumes such as this. It is much faster, but more
    # expensive, to simply download them. Ideally the individual files
    # would have been accessible from s3.
    local remote="http://dammit.lt/wikistats/archive/2010/10/$LOG_FILE"

    # Write straight to the cache file; "-C -" makes curl resume an
    # interrupted download from the size of the existing partial file.
    # (The old version streamed to stdout and re-truncated the partial
    # file with ">", which defeated resumption.)
    curl -C - -s -o "$INPUT_FILE" "$remote"
}

function filter_and_transform {
    # Select matching records/lines from stdin and output a json
    # document that contains an array of matches.
    # Input fields: $1=wikip-project, $2=URL, $3=hits/hr, $4=bytes/hr
    # NOTE: a trailing comma is left before the closing "]", so the
    # output is not strictly valid JSON; consumers must tolerate it.
    awk -v ts="$1" -v pattern="$2" '
    BEGIN { printf("{\"ts\":\"%s\", \"pattern\":\"%s\", \"matches\":[\n", ts, pattern) }
    $0 ~ pattern { printf("[\"%s\",%d,%d],\n", $2, $3, $4) }
    END { print "]}" }'
}

function main {
    # Skip hours that have already been fully processed.
    [[ -e ${OUTPUT_FILE}.done ]] && exit 1

    # get the input file, 1 hour of logs per file
    if [[ ! -e ${INPUT_FILE}.done ]]; then
        get_input_file && touch "${INPUT_FILE}.done"
    fi

    # filter it based on the supplied pattern, transform it to json and save it
    if zcat "$INPUT_FILE" \
        | filter_and_transform "$TIMESTAMP" "$PATTERN" > "$OUTPUT_FILE"; then
        touch "${OUTPUT_FILE}.done"
    fi
}

main
View
@@ -0,0 +1,4 @@
#!/bin/bash
# Usage: process_parallel.sh PATTERN OUTFILE_PREFIX
#
# Fan process_1.sh out over the first 80 hourly log files listed in
# /mnt/working/files.txt.
PATTERN=$1
OUTFILE_PREFIX=$2

# 'xargs -P 5' keeps at most five workers in flight; '-n 1' hands each
# worker a single file name as its third argument.  Streaming head
# straight into xargs avoids the unquoted `echo $FILES` word-splitting
# of the old version.
head -80 /mnt/working/files.txt \
    | xargs -P 5 -n 1 ./process_1.sh "$PATTERN" "$OUTFILE_PREFIX"
View
@@ -0,0 +1,2 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC3nFwFq42u0865ymPUJ0n6C0VMj8El2alOmUoO9SxFDGJPMKFEoxPFO3o1nCkbfKpwRmidx9WDH9rox/sfsBzZLysaKgooi5T2E34T0NTk6qWzT6zFxansi3IDygDhCugM1T8kr5hQlYS6EHp5DmWsRf+vN416eWSiDN2xvCa4IQTRn4d37p9lhN/oKpFKynJZM0gv/JEGGorPLX1BFKzbtcBIcKkmSMJ3RdK/5W1eVl/isL4va6XXEIh/wZaHttuiE+UiBSOvpFMuL3cR5kMjmFuGwqz8nLMzFg+8IUI5cvc78Ix54ketZExAdl8fU60EwIHRKKJKdxFS+WOM/3iH saem@thinky
+ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAwoRD1kwvIDOurW9dENSORjkTDCiJpRxKwSOYug7fcXFlWU6SG5tdAl+P9Jmon5p1lMLW4s9rRnTyp5ZTPdAUWEQeJtkZgwkGOfJqBIp2FUrCK19ZRS9Tqouz4OFFdrAhRa2/Fg4aweOhWRMbSYnosYs8hr8ct2xun4uabYSD7ZCz5VoW8DhUBPAc0Vp3Glj4ziPWe3bjIkbT295SB47Ab8HeKMuGQjaWyTdhWAKmrEnjrInoMZDmoCLhn/Pc5mIvg594BBWwTSImxv65hicy8DDNjPWmh8IqtS3iH09udk21mNeYQYiIAfiqR8+gaK7oT6gz3mXNLWv5dv1w+6CNSw== contract@MacBook-2.local
View
@@ -7,10 +7,14 @@ ec2-authorize default -p 22
ec2-describe-spot-price-history --filter instance-type=m1.small --filter timestamp=2009-12-09*
*** ec2 instance initialization
ami-5a43a933 64-bit
-ami-5043a939 32-bit
+
+aki-4e7d9527
+
ssh-add ~/.ssh/tr-default-ec2
-export TRAMP_HOSTNAME=ec2-184-73-143-234.compute-1.amazonaws.com
+export INSTANCE_HOSTNAME=ec2-50-16-64-231.compute-1.amazonaws.com
+
+export TRAMP_HOSTNAME=
# the wikipedia data
ec2-create-volume --snapshot snap-753dfc1c -z us-east-1a
@@ -20,25 +24,9 @@ ec2-create-volume --snapshot snap-753dfc1c -z us-east-1a
export VOLUME_ID=vol-3c223155
export INSTANCE_ID=i-52a9053f
-ec2-attach-volume $VOLUME_ID -i $INSTANCE_ID -d /dev/sdf
-# ATTACHMENT vol-3c223155 i-4e2a8723 /dev/sdf attaching 2010-10-28T23:39:21+0000
-wget http://hg.basho.com/riak/downloads/riak-0.8.1.tar.gz
-tar zxvf
-cd riak-0.8.1
-
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC3nFwFq42u0865ymPUJ0n6C0VMj8El2alOmUoO9SxFDGJPMKFEoxPFO3o1nCkbfKpwRmidx9WDH9rox/sfsBzZLysaKgooi5T2E34T0NTk6qWzT6zFxansi3IDygDhCugM1T8kr5hQlYS6EHp5DmWsRf+vN416eWSiDN2xvCa4IQTRn4d37p9lhN/oKpFKynJZM0gv/JEGGorPLX1BFKzbtcBIcKkmSMJ3RdK/5W1eVl/isL4va6XXEIh/wZaHttuiE+UiBSOvpFMuL3cR5kMjmFuGwqz8nLMzFg+8IUI5cvc78Ix54ketZExAdl8fU60EwIHRKKJKdxFS+WOM/3iH saem@thinky
-ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAwoRD1kwvIDOurW9dENSORjkTDCiJpRxKwSOYug7fcXFlWU6SG5tdAl+P9Jmon5p1lMLW4s9rRnTyp5ZTPdAUWEQeJtkZgwkGOfJqBIp2FUrCK19ZRS9Tqouz4OFFdrAhRa2/Fg4aweOhWRMbSYnosYs8hr8ct2xun4uabYSD7ZCz5VoW8DhUBPAc0Vp3Glj4ziPWe3bjIkbT295SB47Ab8HeKMuGQjaWyTdhWAKmrEnjrInoMZDmoCLhn/Pc5mIvg594BBWwTSImxv65hicy8DDNjPWmh8IqtS3iH09udk21mNeYQYiIAfiqR8+gaK7oT6gz3mXNLWv5dv1w+6CNSw== contract@MacBook-2.local
+export WIKIPEDIA_VOLUME_ID=vol-3c223155
-##########
-#!/bin/bash
-PATTERN=$1; OUTFILE_PREFIX=$2
-FILES=pagecounts-*gz
-
-# 'xargs -P 4' spawns a maximum of four procs at a time
-echo $FILES | xargs -P 4 -n 1 ./process_1.sh "$PATTERN" "$OUTFILE_PREFIX" # $F
-##########
-
ec2-create-volume --size 10 --availability-zone us-east-1a
# grab volume ID
export VOLUME_ID=vol-5ccfdc35
@@ -47,12 +35,13 @@ ec2-attach-volume $VOLUME_ID -i $INSTANCE_ID -d /dev/sdh
# mount and dd it
# http://aws-musings.com/how-to-create-an-ebs-image-from-an-existing-ec2-instance/
# http://developer.amazonwebservices.com/connect/thread.jspa?threadID=39358&start=15&tstart=0
+# http://shlomoswidler.com/2009/07/boot-ec2-instances-from-ebs.html
ec2-detach-volume $VOLUME_ID -i $INSTANCE_ID
ec2reg -s snap-9bb90ef1 -a x86_64 -d 'gentoo_2010.1' -n 'tavis_gentoo_2010.1'
#ami-8a4bbfe3
-
+*** initial timing notes
################################################################################
ip-10-100-75-21 ~ # time ./process_parallel.sh
View
@@ -2,120 +2,164 @@
import os
import os.path
import string
+from time import sleep
+from tempfile import mkstemp
import boto
-# from urllib2 import urlopen
from fabric.api import (env
, run
+ , cd
+ , settings
#, local
+ , put
#, hosts
- #, cd
#, hide
)
# from fabric.context_managers import prefix
# from fabric.contrib.console import confirm
-class lcd(object):
- """A fabric context manager for changing the local working
- directory.
- """
-
- def __init__(self, path):
- self.path = os.path.expanduser(os.path.expandvars(path))
- self.orig_path = os.getcwd()
-
- def __enter__(self):
- self.orig_path = os.getcwd()
- os.chdir(self.path)
-
- def __exit__(self, _type, _value, _traceback):
- os.chdir(self.orig_path)
-
-class HostContext(object):
- """A fabric context manager that explicitly sets the remote host
- that `run` (etc.) should operate on.
- """
- def __init__(self, host_string):
- self.host_string = host_string
- if '@' in host_string:
- self.user = host_string.split('@')[0]
- else:
- self.user = env.user
-
- def __enter__(self):
- # pylint: disable-msg=W0201
- self.orig_host_string = env.host_string
- self.orig_user = env.user
- self.orig_host = env.host
- env.host_string = self.host_string
- env.user = self.user
-
- def __exit__(self, _type, _value, _traceback):
- env.host_string = self.orig_host_string
- env.host = self.orig_host
- env.user = self.orig_user
-
def InstanceContext(instance, username='root'):
    # Fabric context: point run()/put() at the given EC2 instance (by
    # public IP) as `username` for the duration of the with-block.
    return settings(host=instance.ip_address,
                    user=username,
                    host_string='%s@%s'%(username, instance.ip_address))
-################################################################################
def gen_dev_names():
    # Yield candidate block-device names for EBS attachment:
    # /dev/sdf, /dev/sdg, ... (the [5:] slice skips a-e, which are
    # taken by the instance's own disks).
    # NOTE(review): string.letters is Python 2 only and locale
    # dependent; string.ascii_letters would be safer.
    for l in string.letters[5:]:
        yield '/dev/sd%s'%l
def create_udev_rule_for_ebs_vol(device, mountpoint):
    """Upload a udev rule to the remote host that auto-mounts the EBS
    volume at `mountpoint` when its block device appears, and
    unmounts/cleans up when the device is removed.

    `device` may be given as '/dev/sdX' or bare 'sdX'.
    Requires an active fabric host context (uses put()).
    """
    if device.startswith('/dev/'):
        device = device[len('/dev/'):]
    assert len(device)==3
    assert device.startswith('sd')
    tmp_fd, tmp_path = mkstemp()
    tmp_file = os.fdopen(tmp_fd, 'w')
    # BUG FIX: the mount rule used a hard-coded /dev/sdf, so any volume
    # attached on another device was mounted from the wrong disk; it now
    # uses the rule's own device name.
    tmp_file.write(
        '''\
SUBSYSTEM!="block", GOTO="%(dev)s_end"
KERNEL!="%(dev)s", GOTO="%(dev)s_end"

ACTION=="add", RUN+="/bin/mkdir -p %(mountpoint)s"
ACTION=="add", RUN+="/bin/mount /dev/%(dev)s %(mountpoint)s"

ACTION=="remove", RUN+="/bin/umount %(mountpoint)s"
ACTION=="remove", RUN+="/bin/rmdir %(mountpoint)s"

LABEL="%(dev)s_end"
'''%{'dev':device, 'mountpoint':mountpoint})
    tmp_file.close()
    put(tmp_path, '/etc/udev/rules.d/86-%s.rules'%device)
    os.unlink(tmp_path)
+
+
def attach_and_mount_ebs_volume(instance, volume_id, device, mount_point):
    # Install the udev auto-mount rule first so the volume is mounted as
    # soon as the kernel sees the device attach.
    create_udev_rule_for_ebs_vol(device, mount_point)
    # NOTE(review): attach_volume is asynchronous; nothing here waits
    # for the attach (or the mount) to complete.  Uses the module-level
    # ec2_conn.
    ec2_conn.attach_volume(volume_id=volume_id,
                           instance_id=instance.id,
                           device=device)
+
################################################################################
ec2_conn = boto.connect_ec2()
-
-def main():
-
- wiki_data_snapshot_id = "snap-753dfc1c"
- zone = 'us-east-1a'
-
- base_image_id = "ami-5a43a933"
- base_image = ec2_conn.get_image(base_image_id)
-
- base_instance = base_image.run(
- instance_type='m1.large',
# NOTE(review): '~' is not expanded automatically by os.path functions;
# callers must expanduser() this before touching the filesystem.
local_working_dir = '~/nosql_summer_hackathon/'
wikistats_snapshot_id = "snap-753dfc1c"   # public wikipedia stats snapshot
zone = 'us-east-1a'
instances = []   # every instance launched by create_instance()

base_image_id = "ami-5a43a933"   # 64-bit base image (see riak.org notes)
base_image = ec2_conn.get_image(base_image_id)

key_name = 'nosql-riak-keypair'
def setup_keypair(key_name=key_name):
    """Create the EC2 keypair if it does not exist yet and save its
    private key as <local_working_dir>/<key_name>.pem."""
    # expanduser: local_working_dir starts with '~', which exists()
    # would otherwise treat literally (always False) and the .pem would
    # land in a directory literally named '~'.
    keyfile_path = os.path.expanduser(
        os.path.join(local_working_dir, '%s.pem'%key_name))
    if not os.path.exists(keyfile_path):
        kp = ec2_conn.create_key_pair(key_name)
        # close the handle promptly instead of leaking it
        with open(keyfile_path, 'w') as keyfile:
            keyfile.write(kp.material)
    ## @@TR: finish ...
+
def get_running_instances():
    # NOTE(review): get_all_instances() returns a list of Reservations;
    # taking [0] yields only the FIRST reservation's instances, not all
    # running instances — verify that is the intent (see the usage note
    # at the bottom of the file, which grabs a single base instance).
    return ec2_conn.get_all_instances()[0].instances
+
def create_instance(
    instance_type='m1.large',
    key_name='nosql-riak-keypair'):
    """Launch one instance of the module-level base_image, wait for it
    to leave the 'pending' state, record it in `instances`, and return
    it."""
    instance = base_image.run(
        instance_type=instance_type,
        # BUG FIX: the key_name argument was accepted but ignored
        # (a hard-coded 'nosql-riak-keypair' was passed instead).
        key_name=key_name,
        security_groups=['default'],
        placement=zone).instances[0]
    instances.append(instance)
    # @@TR: should add a timeout
    while instance.state == 'pending':
        sleep(2)
        instance.update()
    return instance
+
def bootstrap_instance(instance):
    # Full provisioning pipeline for a freshly launched instance:
    # OS packages first, then EBS volumes, then the riak build.
    bootstrap_gentoo(instance)
    attach_nosql_summer_volumes(instance)
    install_erlang_and_riak(instance)
    return instance
+
def bootstrap_gentoo(instance):
    # First-boot setup of the gentoo image: a shell convenience alias
    # plus the packages the rest of this script needs (erlang for riak,
    # hg/git for source checkouts, monitoring tools).
    with InstanceContext(instance):
        # the grep guard keeps re-runs from appending the alias twice
        run("""
        ( grep 'alias l=' /etc/bash/bashrc > /dev/null ) ||\
        echo 'alias l=\"ls -alh --color\"' >> /etc/bash/bashrc
        """)
        run("emerge -u eix")
        run("eix-sync")
        run("emerge -u bash")
        run("emerge -u erlang mercurial git htop screen sysstat lsof")
- if False:
- downloads_vol = ec2_conn.create_volume(size=5, zone=zone)
- wiki_downloads_vol_id = downloads_vol.id
- else:
- wiki_downloads_vol_id = 'vol-6e021807'
- wiki_downloads_vol_dev_name = base_instance_devs.next()
- ec2_conn.attach_volume(volume_id=wiki_downloads_vol_id,
- instance_id=base_instance.id,
- device=wiki_downloads_vol_dev_name)
def attach_nosql_summer_volumes(instance,
                                attach_downloads=False,
                                attach_wikistats=False):
    # Attach the project's EBS volumes to `instance`.  The working
    # volume is always attached (auto-mounted at /mnt/working via a
    # udev rule); the downloads and wikistats volumes are opt-in.
    disk_devs = gen_dev_names()
    # The host context is needed for the put() inside
    # create_udev_rule_for_ebs_vol; the ec2_conn calls are plain API
    # calls.
    with InstanceContext(instance):
        attach_and_mount_ebs_volume(instance=instance,
                                    volume_id="vol-ce8b94a7",
                                    device=disk_devs.next(),
                                    mount_point='/mnt/working')

        if attach_downloads:
            # 'if False' keeps the one-off volume-creation code around;
            # the pre-created volume id below is reused instead.
            if False:
                downloads_vol = ec2_conn.create_volume(size=5, zone=zone)
                wiki_downloads_vol_id = downloads_vol.id
            else:
                wiki_downloads_vol_id = 'vol-6e021807'
            wiki_downloads_vol_dev_name = disk_devs.next()
            create_udev_rule_for_ebs_vol(
                wiki_downloads_vol_dev_name, '/mnt/downloads')
            ec2_conn.attach_volume(volume_id=wiki_downloads_vol_id,
                                   instance_id=instance.id,
                                   device=wiki_downloads_vol_dev_name)

        if attach_wikistats:
            # 400GB volume restored from the public wikistats snapshot.
            wikistats_vol = ec2_conn.create_volume(
                size=400, snapshot=wikistats_snapshot_id, zone=zone)
            wikistats_vol_dev_name = disk_devs.next()
            create_udev_rule_for_ebs_vol(
                wikistats_vol_dev_name, "/mnt/wikistats")
            ec2_conn.attach_volume(volume_id=wikistats_vol.id,
                                   instance_id=instance.id,
                                   device=wikistats_vol_dev_name)
+
#riak_install_dir = '/mnt/riak-0.13.0'
def install_erlang_and_riak(instance):
    # Download and build riak 0.13.0 from the source tarball in the
    # remote user's home dir.  Each step is guarded so re-runs skip
    # completed work.
    # NOTE(review): despite the name, erlang itself is installed by the
    # emerge in bootstrap_gentoo, not here.
    with InstanceContext(instance):
        #run("hg clone http://hg.basho.com/riak")
        run("""[[ -e riak-0.13.0.tar.gz ]] || \
        curl -O http://downloads.basho.com/riak/riak-0.13/riak-0.13.0.tar.gz""")
        run("[[ -e riak-0.13.0 ]] || tar zxvf riak-0.13.0.tar.gz")
        run("cd riak-0.13.0/ && make all")
-def update_gentoo(instance):
+def create_local_riak_cluster(instance):
with InstanceContext(instance):
- run("emerge eix")
- run("eix-sync")
+ run("cd riak-0.13.0/ && ( [[ -e dev ]] || make devrel )")
################################################################################
+# base_instance = get_running_instances()[0]

0 comments on commit 680ef97

Please sign in to comment.