In [4]:
import pywren
import numpy as np
from boto.s3.connection import S3Connection
from boto.s3.key import Key
import pywren.storage as storage
import boto3
import pickle 
import sys
import time
import os
import redis
from rediscluster import StrictRedisCluster

In [5]:
# ---- configuration ----
num_workers = 3                   # number of mapper tasks and of reducer tasks
bucket_name = 'terasort-yawen'    # S3 bucket holding the inputs and the sorted outputs

# The data to sort must already be split into `num_workers` files, one per
# mapper, named <file_name>0, <file_name>1, ... inside the local directory
# path_local + path. `path` is also used as the S3 key prefix for the inputs.
path_local = 'input_files/'
path = "input_3_3M/"
file_name = 'input'

# Local path prefix of the input files; the file index is appended on use.
concat_file_name = "".join([path_local, path, file_name])

In [6]:
s3_client = boto3.client('s3')

# Upload the n local input files to S3 (inputs to the mapper stage).
# Local file:  concat_file_name + str(i)
# S3 key:      path + file_name + str(i)
for i in range(num_workers):
    # The context manager closes each local file after its upload call
    # returns, instead of leaking one open handle per iteration.
    with open(concat_file_name + str(i), 'rb') as input_file:
        result = s3_client.put_object(
            Bucket=bucket_name,
            Body=input_file,
            Key=path + file_name + str(i)
        )

In [18]:
## a key with sample[i-1] < key <= sample[i] is sent to reducer i
def get_sample_keys(file_path, num_workers):
    """Compute the upper-bound key of each reducer's key range.

    Every line of ``file_path`` is treated as one record whose key is its
    first 10 characters (the same slice the mapper and reducer use, so a key
    that itself contains two consecutive spaces is not truncated). The keys
    are sorted and ``num_workers`` boundary keys are returned: boundaries
    ``0 .. num_workers-2`` are evenly spaced through the sorted keys and the
    last boundary is the largest key in the file.

    Args:
        file_path: path of a local text file with one record per line.
        num_workers: number of partitions (reducers); must be >= 1.

    Returns:
        A list of ``num_workers`` keys in ascending order.

    Raises:
        ValueError: if ``num_workers`` is smaller than 1 or the file
            contains no lines.
    """
    if num_workers < 1:
        raise ValueError("num_workers must be at least 1")

    # `with` guarantees the file is closed even if reading fails.
    with open(file_path, "r") as f:
        key_list = sorted(line[:10] for line in f)

    length = len(key_list)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("num records: " + str(length))
    if length == 0:
        raise ValueError("no records found in " + file_path)

    # Floor division keeps the stride an integer on both Python 2 and 3.
    key_range = length // num_workers

    # Boundary i (1-based) sits at index i * key_range; the final boundary is
    # always the maximum key so that every sampled key falls in some range.
    sample_key_list = [key_list[i * key_range] for i in range(1, num_workers)]
    sample_key_list.append(key_list[-1])
    return sample_key_list

#get_sample_keys('input_files/input0', num_workers)

In [29]:
# Scratch check: on this (Python 2) kernel range(1, 3) evaluates to the list [1, 2].
range(1,3)

[1, 2]

In [76]:
# partition stage: partition input data into n groups 
def mapper(data):
    """Partition stage: range-partition one input file into n groups.

    Args:
        data: list ``[id, num_workers, bucket_name, sample_keys, path]``.
            The input object ``path + 'input' + str(id)`` is downloaded from
            ``bucket_name``; ``sample_keys`` holds the ascending upper-bound
            key of each of the ``num_workers`` partitions.

    Returns:
        ``[read_s, compute_s, write_s]``: seconds spent downloading the
        input, partitioning it, and writing shuffle data. The shuffle write
        is currently disabled (see the note in the body), so ``write_s``
        only measures two consecutive clock reads.
    """
    mapper_id = data[0]
    n = data[1]
    bucket_name = data[2]
    sample_keys = data[3]
    path = data[4]

    t0 = time.time()
    # [s3] read from input file: input<id>
    s3 = boto3.resource('s3')
    file_local = '/tmp/input_tmp'
    s3.Bucket(bucket_name).download_file(path + 'input' + str(mapper_id), file_local)
    t1 = time.time()

    # partition
    try:
        with open(file_local, "r") as f:
            lines = f.readlines()  # each line contains a 100B record
    finally:
        # Remove the temporary copy even when reading it fails.
        os.remove(file_local)

    p_list = [[] for _ in range(n)]  # list of n partitions
    # Highest partition index the scan may reach. Keys larger than every
    # sample key (possible when the samples do not cover this mapper's data)
    # go to the last partition instead of running past the end of the lists.
    last = min(n - 1, len(sample_keys))
    for line in lines:
        key = line[:10]
        index = 0
        while index < last and key > sample_keys[index]:
            index += 1
        p_list[index].append(line)
    t2 = time.time()

    # NOTE: the shuffle write below is disabled, so p_list is discarded and
    # nothing is stored in Redis by this function.
    #
    # #write to output files in tmp: shuffle<id 0> shuffle<id 1> shuffle<id num_workers-1>
    # f_list = [] #output file list
    # #redis_client = redis.Redis(host="rediscluster.a9ith3.clustercfg.usw2.cache.amazonaws.com", port=6379)
    # startup_nodes = [{"host": "rediscluster.a9ith3.clustercfg.usw2.cache.amazonaws.com", "port": "6379"}]
    # redis_client = StrictRedisCluster(startup_nodes=startup_nodes, decode_responses=True, skip_full_coverage_check=True)
    #
    # for i in range(n):
    #     key = 'tmp/shuffle' + str(id) + str(i)
    #     result = redis_client.set(key, pickle.dumps(p_list[i]))

    t3 = time.time()

    # return time spent (in sec): read input, compute, write shuffle
    return [t1 - t0, t2 - t1, t3 - t2]

#mapper([0, num_workers, bucket_name, sample_keys, path])  

In [78]:
# sort stage: merge n sets of data & sort 
def reducer(data):
    """Sort stage: merge the n shuffle partitions for this reducer and sort them.

    Args:
        data: list ``[id, num_workers, bucket_name]``. Shuffle partitions are
            read from Redis under the keys ``'tmp/shuffle' + str(i) + str(id)``
            for ``i`` in ``range(num_workers)``; the sorted result is uploaded
            to ``bucket_name`` as ``'output/sorted_output' + str(id)``.

    Returns:
        ``[read_s, compute_s, write_s]``: seconds spent reading the shuffle
        data, merging and sorting, and writing the output; or ``-1`` if any
        shuffle key is missing from Redis.
    """
    reducer_id = data[0]
    n = data[1]
    bucket_name = data[2]

    # read from input file in tmp: shuffle<0 id> shuffle<1 id> ... shuffle<num_workers-1 id>
    t0 = time.time()
    #redis_client = redis.Redis(host="rediscluster.a9ith3.clustercfg.usw2.cache.amazonaws.com", port=6379)
    startup_nodes = [{"host": "rediscluster.a9ith3.clustercfg.usw2.cache.amazonaws.com", "port": "6379"}]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes, decode_responses=True, skip_full_coverage_check=True)

    lines_list = []
    for i in range(n):
        body = redis_client.get('tmp/shuffle' + str(i) + str(reducer_id))
        if body is None:
            # Missing partition: signal failure with the sentinel callers get today.
            return -1
        # SECURITY NOTE: pickle.loads can execute arbitrary code; this is only
        # acceptable while the Redis cluster holds trusted data.
        lines_list.append(pickle.loads(body))
    t1 = time.time()

    # merge & sort
    # extend() merges in linear time; sum(lists, []) re-copies the growing
    # accumulator for every partition.
    merged_lines = []
    for part in lines_list:
        merged_lines.extend(part)

    # (key, rest) pairs: first 10 characters are the key, characters 10-11
    # are skipped and written back as two spaces below.
    tuples_list = [(line[:10], line[12:]) for line in merged_lines]
    sorted_tuples_list = sorted(tuples_list, key=lambda pair: pair[0])
    t2 = time.time()

    # [s3] write to output file: output<id>
    local_path = '/tmp/sorted_output'
    with open(local_path, 'w') as f:
        for key, rest in sorted_tuples_list:
            f.write(key + '  ' + rest)

    s3_client = boto3.client('s3')
    # Open the upload body in a context manager so the handle is closed.
    with open(local_path, 'rb') as sorted_file:
        s3_client.put_object(
            Bucket=bucket_name,
            Body=sorted_file,
            Key='output/sorted_output' + str(reducer_id)
        )
    t3 = time.time()

    # return time (in sec): read shuffle, compute, write output
    return [t1 - t0, t2 - t1, t3 - t2]

#reducer([0, num_workers, bucket_name])

In [79]:
import subprocess

def final_reducer(data):
    """Download the per-reducer outputs and concatenate them into one file.

    Args:
        data: list ``[n, bucket_name]``. The objects
            ``'output/sorted_output' + str(i)`` for ``i`` in ``range(n)`` are
            downloaded from ``bucket_name`` to the same relative local paths
            and concatenated, in reducer order, into ``output/sorted_output``.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    import os
    import shutil

    t0 = time.time()
    n = data[0]
    bucket_name = data[1]

    # download_file needs the target directory to exist.
    if not os.path.isdir('output'):
        os.makedirs('output')

    bucket = boto3.resource('s3').Bucket(bucket_name)

    part_paths = []
    for i in range(n):
        key = 'output/sorted_output' + str(i)
        file_local = 'output/sorted_output' + str(i)
        bucket.download_file(key, file_local)
        part_paths.append(file_local)

    # Concatenate the parts in numeric reducer order. A shell glob
    # (`cat sorted_output*`) orders names lexicographically
    # (..., sorted_output1, sorted_output10, sorted_output2, ...), which breaks
    # the global order for more than 10 reducers, and it also matches the
    # merged file and stale parts left by an earlier run.
    with open('output/sorted_output', 'wb') as merged:
        for part_path in part_paths:
            with open(part_path, 'rb') as part:
                shutil.copyfileobj(part, merged)

    t1 = time.time()
    return (t1 - t0)

In [80]:
# Create the pywren executor used below to submit the map and reduce stages.
wrenexec = pywren.default_executor()

In [81]:
# Range boundaries shared by every mapper.
sample_keys = get_sample_keys(concat_file_name, num_workers)

# One argument list per task, in the positional layout that mapper() and
# reducer() unpack from `data`.
map_data_list = [
    [i, num_workers, bucket_name, sample_keys, path]
    for i in range(num_workers)
]
reduce_data_list = [
    [i, num_workers, bucket_name]
    for i in range(num_workers)
]


num records: 30000


In [82]:
# Map stage: submit `mapper` with each argument list in map_data_list; the futures are resolved in the next cell.
futures = wrenexec.map(mapper, map_data_list)

In [83]:
# Collect the mappers' return values: one [read input, compute, write shuffle] timing list per mapper.
results_map = pywren.get_all_results(futures)

In [84]:
# Reduce stage: submit `reducer` with each argument list in reduce_data_list.
# NOTE(review): the mapper's shuffle write is currently disabled, so the reducers
# read whatever is already stored in Redis under the tmp/shuffle<i><id> keys.
futures = wrenexec.map(reducer, reduce_data_list)

In [85]:
# Collect the reducers' return values: [read shuffle, compute, write output] timings, or -1 for a reducer that found a shuffle key missing.
results_reduce = pywren.get_all_results(futures)

In [86]:
# Split the per-mapper [read input, compute, write inter] timings by phase.
t_io = [r[0] for r in results_map]
t_comp = [r[1] for r in results_map]
t_inter = [r[2] for r in results_map]

# Report the mean time (in sec) of each phase across mappers.
print("map:")
for label, samples in (("read input: ", t_io),
                       ("compute: ", t_comp),
                       ("write inter: ", t_inter)):
    print(label + str(sum(samples) / len(samples)))

# raw per-mapper timings (in sec): [read input, compute, write inter]
results_map

map:
read input: 0.323478062948
compute: 0.00417224566142
write inter: 1.03314717611e-06


[[0.23617792129516602, 0.003701925277709961, 1.1920928955078125e-06],
 [0.348499059677124, 0.004467964172363281, 9.5367431640625e-07],
 [0.3857572078704834, 0.0043468475341796875, 9.5367431640625e-07]]

In [87]:
# returns time spent (in sec) reading intermediate data in each reducer 

# Split the per-reducer [read inter, compute, write output] timings by phase.
t_io = [r[2] for r in results_reduce]
t_comp = [r[1] for r in results_reduce]
t_inter = [r[0] for r in results_reduce]

# Report the mean time (in sec) of each phase across reducers.
print("reduce:")
for label, samples in (("read inter: ", t_inter),
                       ("compute: ", t_comp),
                       ("write output: ", t_io)):
    print(label + str(sum(samples) / len(samples)))

# raw per-reducer timings (in sec): [read inter, compute, write output]
results_reduce

reduce:
read inter: 0.155095656713
compute: 0.0137510299683
write output: 0.361885945002


[[0.17439794540405273, 0.014990091323852539, 0.5106229782104492],
 [0.07297301292419434, 0.009698152542114258, 0.21471381187438965],
 [0.21791601181030273, 0.01656484603881836, 0.360321044921875]]

In [88]:
# final stage: concatenate outputs from the reduce/sort stage to form a single sorted output file
# Runs locally (not through pywren); the displayed value is the elapsed time in seconds.
final_reducer([num_workers,bucket_name])


0.941709041595459