In [240]:
import pywren
import numpy as np
from boto.s3.connection import S3Connection
from boto.s3.key import Key
import pywren.storage as storage
import boto3
import pickle 
import sys
from redis import Redis

In [255]:
# configuration
num_workers = 10  # alternative values left by the author: 20, 100
bucket_name = 'terasort-yawen-lambda'

# The data to be sorted must already be split into `num_workers` files, one per
# mapper. `file_path_local` is the local directory holding those files; each is
# named `file_name` followed by its index (the upload cell opens
# file_path_local + file_name + str(i)): input0, input1, ...
file_path_local = 'inputs/input_10_30M/'
file_name = 'input'
# File handed to get_sample_keys() to choose the partition boundary keys.
concat_file_name = 'inputs/input_10_30M/input'

In [242]:
import boto3
#boto3 already configured 

s3_client = boto3.client('s3')

# Upload the n input files to S3 (inputs to the mapper stage). Object keys are
# file_name + str(i), which is what mapper() downloads as 'input' + str(id).
for i in range(num_workers):
    # Open each part in a `with` block so its handle is closed as soon as the
    # upload returns; the bare open() used before leaked one handle per part.
    with open(file_path_local + file_name + str(i), 'rb') as body:
        result = s3_client.put_object(
            Bucket=bucket_name,
            Body=body,
            Key=file_name + str(i)
        )

In [243]:
## a key with sample[i-1] < key <= sample[i] is sent to reducer i
def get_sample_keys(file_path, num_workers):
    """Choose ``num_workers`` partition boundary keys from the records in a file.

    Each line is split on two spaces and its first field is taken as the key.
    The keys are sorted and one boundary is picked every
    ``len(keys) // num_workers`` positions; the final boundary is always the
    largest key.

    Args:
        file_path: path of the text file to sample, one record per line.
        num_workers: number of partitions (and therefore boundaries) wanted.

    Returns:
        A list of ``num_workers`` keys in ascending order.

    Raises:
        ZeroDivisionError: if ``num_workers`` is 0.
        IndexError: if the file contains no lines.
    """
    # `with` closes the handle even if reading fails; it used to be left open.
    with open(file_path, "r") as f:
        key_list = [line.split("  ")[0] for line in f]

    key_list.sort()
    n = num_workers
    # Floor division keeps the stride an int on Python 3 too; plain `/` yields
    # a float there, which cannot be used as a list index.
    key_range = len(key_list) // n

    # Boundaries 1..n-1 sit at multiples of the stride.
    sample_key_list = [key_list[i * key_range] for i in range(1, n)]
    if n >= 1:
        # The last boundary is the maximum key, so no key sorts after it.
        sample_key_list.append(key_list[-1])

    return sample_key_list

#sample_keys = get_sample_keys('input_files/input0', num_workers)
#sample_keys

In [244]:
import time 

# partition stage: partition input data into n groups 
def mapper(data):
    """Partition stage: split one input file into ``n`` key ranges and store them in Redis.

    Args:
        data: ``[id, num_workers, bucket_name, sample_keys]``. ``id`` selects
            the S3 object ``input<id>`` in ``bucket_name``; ``sample_keys``
            holds the upper boundary key of each partition and must be in
            ascending order (get_sample_keys() produces it that way).

    Returns:
        ``[read_input, compute, write_shuffle]``: seconds spent in each phase.

    Uses the notebook-level ``time``, ``boto3`` and ``pickle`` imports.
    """
    import os
    from bisect import bisect_left
    from redis import Redis

    worker_id = data[0]
    n = data[1]
    bucket_name = data[2]
    sample_keys = data[3]

    t0 = time.time()
    # [s3] read from input file: input<id>
    s3 = boto3.resource('s3')
    file_local = '/tmp/input_tmp'
    s3.Bucket(bucket_name).download_file('input' + str(worker_id), file_local)
    t1 = time.time()

    # partition
    with open(file_local, "r") as f:
        lines = f.readlines()  # one record per line
    os.remove(file_local)

    p_list = [[] for _ in range(n)]  # list of n partitions
    last_partition = n - 1
    for line in lines:
        record_key = line.split("  ")[0]
        # First boundary that is >= the key, i.e. the same partition the old
        # linear scan selected, found by binary search.
        index = bisect_left(sample_keys, record_key)
        # A key larger than every boundary goes to the last partition; the
        # linear scan ran off the end of sample_keys with an IndexError.
        p_list[min(index, last_partition)].append(line)
    t2 = time.time()

    # write to Redis: shuffle<id><0> shuffle<id><1> ... shuffle<id><n-1>
    gateway_ip = '54.212.247.168'
    gateway_port = '8888'
    bind_port = 40000 + worker_id
    redis = Redis(gateway_ip, int(gateway_port), bind_port=int(bind_port))
    for i in range(n):
        # NOTE(review): the two indices are concatenated without a separator,
        # so (1, 10) and (11, 0) both give 'shuffle110' once there are more
        # than 10 workers. reducer() builds its keys the same way, so the
        # format has to be changed in both places together.
        key = 'shuffle' + str(worker_id) + str(i)
        redis.set(key, pickle.dumps(p_list[i]))
    t3 = time.time()

    # time spent (in sec): read input, compute, write shuffle
    return [t1 - t0, t2 - t1, t3 - t2]

#mapper([2, num_workers, bucket_name, sample_keys])  

In [245]:
import time 

# sort stage: merge n sets of data & sort 
def reducer(data):
    """Sort stage: gather this reducer's partition from every mapper, sort it, upload it.

    Args:
        data: ``[id, num_workers, bucket_name]``. ``id`` is this reducer's
            partition index; the result is stored in ``bucket_name`` under
            the key ``sorted_output<id>``.

    Returns:
        ``[read_shuffle, compute, write_output]`` (seconds spent in each
        phase), or ``-1`` if any expected shuffle key is missing from Redis.

    Uses the notebook-level ``time``, ``boto3`` and ``pickle`` imports.
    """
    from redis import Redis

    reducer_id = data[0]
    n = data[1]
    bucket_name = data[2]

    # read from Redis: shuffle<0><id> shuffle<1><id> ... shuffle<n-1><id>
    t0 = time.time()
    gateway_ip = '54.212.247.168'
    gateway_port = '8888'
    bind_port = 50000 + reducer_id
    redis = Redis(gateway_ip, int(gateway_port), bind_port=int(bind_port))
    lines_list = []
    for i in range(n):
        # Mapper i stores this reducer's share under shuffle<i><id>. The key
        # used to be built with a literal 0, so mapper 0's data was read n
        # times and every other mapper's data was dropped.
        key = 'shuffle' + str(i) + str(reducer_id)
        body = redis.get(key)
        if body is None:
            return -1
        # NOTE(security): pickle.loads runs arbitrary code from its input;
        # this is only acceptable while the Redis gateway is trusted.
        lines_list.append(pickle.loads(body))
    t1 = time.time()

    # merge & sort
    tuples_list = []
    for lines in lines_list:  # flat iteration avoids the quadratic sum(lists, [])
        for line in lines:
            # Split only at the first separator so the rest of the record is
            # kept verbatim, even if it contains further double spaces.
            record_key, _, payload = line.partition('  ')
            tuples_list.append((record_key, payload))

    sorted_tuples_list = sorted(tuples_list, key=lambda x: x[0])
    t2 = time.time()

    # [s3] write to output object: sorted_output<id>
    s3_client = boto3.client('s3')
    s3_client.put_object(
        Bucket=bucket_name,
        Body=pickle.dumps(sorted_tuples_list),
        Key='sorted_output' + str(reducer_id)
    )
    t3 = time.time()

    # time spent (in sec): read shuffle, compute, write output
    return [t1 - t0, t2 - t1, t3 - t2]

#reducer([0, num_workers, bucket_name])

In [246]:
def final_reducer(data):
    """Concatenate the per-reducer sorted outputs into one local file.

    Args:
        data: ``[n, bucket_name]``. The objects ``sorted_output0`` ..
            ``sorted_output<n-1>`` are fetched from ``bucket_name`` in index
            order.

    Returns:
        Seconds elapsed for the whole download-and-write.

    Each fetched object is unpickled into a list of 2-item records, and every
    record is written to ``./sorted_output`` as its two items joined by two
    spaces. Uses the notebook-level ``time``, ``boto3`` and ``pickle`` imports.
    """
    import os

    t0 = time.time()
    n, bucket_name = data[0], data[1]

    # [s3] read the reducer outputs: sorted_output<0> ... sorted_output<n-1>
    s3_client = boto3.client('s3')
    tuples_list = []
    for part in range(n):
        response = s3_client.get_object(Bucket=bucket_name,
                                        Key='sorted_output' + str(part))
        tuples_list.extend(pickle.loads(response['Body'].read()))

    with open('./sorted_output', 'w') as out:
        out.writelines(str(record[0]) + '  ' + str(record[1])
                       for record in tuples_list)
    t1 = time.time()
    return (t1 - t0)

In [247]:
# pywren executor used by the wrenexec.map(...) calls that launch the map and reduce stages.
wrenexec = pywren.default_executor()

In [248]:
# Boundary keys shared by all mappers, taken from the concatenated input file.
sample_keys = get_sample_keys(concat_file_name, num_workers)

# One argument list per worker:
#   mapper  <- [id, num_workers, bucket_name, sample_keys]
#   reducer <- [id, num_workers, bucket_name]
map_data_list = [[i, num_workers, bucket_name, sample_keys]
                 for i in range(num_workers)]
reduce_data_list = [[i, num_workers, bucket_name]
                    for i in range(num_workers)]


In [249]:
# Map stage: submit mapper() once per entry of map_data_list.
futures = wrenexec.map(mapper, map_data_list)

In [250]:
# Collect each mapper's return value: [read input, compute, write shuffle] seconds.
results_map = pywren.get_all_results(futures)

In [251]:
# Reduce stage: submit reducer() once per entry of reduce_data_list.
futures = wrenexec.map(reducer, reduce_data_list)

In [252]:
# Collect each reducer's return value: [read shuffle, compute, write output]
# seconds, or -1 when the reducer found a shuffle key missing.
results_reduce = pywren.get_all_results(futures)

In [253]:
# Each mapper result is [read input, compute, write shuffle] in seconds;
# regroup the values by phase and print the per-phase mean.
t_io = [r[0] for r in results_map]
t_comp = [r[1] for r in results_map]
t_inter = [r[2] for r in results_map]

print("map:")
print("read input: " + str(sum(t_io) / len(t_io)))
print("compute: " + str(sum(t_comp) / len(t_comp)))
print("write inter: " + str(sum(t_inter) / len(t_inter)))

# per-mapper timings (in sec), shown as the cell output
results_map

map:
read input: 0.21368598938
compute: 0.0870207150777
write inter: 3.1236846447


[[0.21266603469848633, 0.09230589866638184, 3.2602360248565674],
 [0.23511099815368652, 0.07327914237976074, 2.75010085105896],
 [0.19328093528747559, 0.09547710418701172, 3.3607170581817627]]

In [254]:
# returns time spent (in sec) reading intermediate data in each reducer 

# reducer() returns -1 instead of a timing list when a shuffle key is missing.
# Indexing that int raised a TypeError here, so failed reducers are left out
# of the averages and reported as a count instead.
reduce_ok = [r for r in results_reduce if r != -1]
reduce_failed = len(results_reduce) - len(reduce_ok)

# Each successful result is [read shuffle, compute, write output] in seconds.
t_inter = [r[0] for r in reduce_ok]
t_comp = [r[1] for r in reduce_ok]
t_io = [r[2] for r in reduce_ok]

print("reduce:")
if reduce_failed:
    print("failed reducers (missing shuffle data): " + str(reduce_failed))
if reduce_ok:
    print("read inter: " + str(sum(t_inter) / len(t_inter)))
    print("compute: " + str(sum(t_comp) / len(t_comp)))
    print("write output: " + str(sum(t_io) / len(t_io)))
results_reduce

reduce:
read inter: 3.55727163951
compute: 0.679601669312
write output: 7.11767133077


[[4.683604001998901, 1.3008201122283936, 7.160967826843262],
 [1.5085079669952393, 0.29518890380859375, 7.957313060760498],
 [4.479702949523926, 0.442795991897583, 6.2347331047058105]]

In [100]:
# final stage: concatenate outputs from the reduce/sort stage to form a single sorted output file
#final_reducer([num_workers,bucket_name])
