In [101]:
# Specific extraction of information from different formats

# This extracts the latency in milliseconds from the ping's result format
# 64 bytes from 13.66.225.134: icmp_seq=1 ttl=47 time=36.2 ms => 36.2
def get_ping_list(file_name):
    l = []
    with open(file_name) as f:
        for line in f:
            try:
                l.append(float(line.split()[6].split("=")[1]))
            except:
                continue
        if len(l) < 1000:
            print file_name # to catch any deficiencies
        return l
 
# This extracts the latency from files that just have a number per row
def get_integer_list(file_name):
    l = []
    with open(file_name) as f:
        for line in f:
            try:
                l.append(int(line))
            except:
                continue
    return l


# This extracts all latency breakdown results from the giza_latency_raw directory
# returns a map {key, [list of latency for the key]}
def get_giza_latency_raw_dir_map(file_name):
    from collections import defaultdict
    d = defaultdict(list)
    with open(file_name) as f:
        for line in f:
            l = line.split()
            if len(l) == 3:
                key = l[0] + l[1]
                d[key].append(int(l[2]))
            elif len(l) == 4:
                key = l[0] + l[1] + l[2]
                d[key].append(int(l[3]))
            else:
                if len(l) == 0 or len(l) == 1:
                    continue

                d[l[0]].append(int(l[1]))
    return d

# helper mapping for data extraction within giza_latency_raw directory
def get_dc_ip_mappings():
    dic = {}
    dic['south-central'] = '13.65.92.139'
    dic['west1'] = "13.93.236.162"
    dic['central'] = "localhost"
    dic['west-central'] = '52.161.28.134'
    dic['north-central'] = '157.56.29.194'
    dic['east1'] = '191.237.41.69'
    dic['east2'] = '13.68.110.92'
    dic['us-south-central'] = '13.65.92.139'
    dic['us-west1'] = "13.93.236.162"
    dic['us-central'] = "localhost"
    dic['asia-japan-east'] = "13.78.83.9"
    dic['eu-north-europe'] = '52.178.201.184'
    dic['eu-uk-west'] = '51.141.11.143'
    dic['asia-japan-west'] = '104.214.146.200'
    
    #     dic['japan-east'] = "13.78.83.9"
#     dic['north-europe'] = '52.178.201.184'
    return dic

# helper function to get the configuration information
def get_configurations(configuration):
    configuration_names = ['2-1-us', '2-1-world', '6-1-us', '6-1-world']
    configurations = [["central", "south-central", "west1"],
                      ["us-central", "eu-north-europe", "asia-japan-east"],
                      ["central", "west-central", "south-central",
                       "north-central", "east1", "east2", "west1"],
                      ["us-central", "us-south-central", "us-west1",
                       "eu-north-europe", "eu-uk-west", "asia-japan-east",
                       "asia-japan-west"]]
    return (configuration_names[configuration], configurations[configuration])


# This is used to extract the specific thrift latency from the dc to all other dc's storage within a giza experiment
# results are limited to:
#   - central to [central, west1, south central]
#   - central to [central, japan east, north europe]
#   - central to [centra, west central, north central, south central, east1, east2, west1]
#   - central to [central, west1, south central, japan east, japan west, north europe, uk west]
# input:
#    configuration :
#      0: us-2-1
#      1: world-2-1
#      2: us-6-1
#      3: world-6-1
#    size: 256kb, 1mb, 4mb
#    op: put, get
def get_thrift_storage_lists(configuration, size, op):
    # mapping info
    dic = get_dc_ip_mappings()
    configuration_name, configurations = get_configurations(configuration)
    #configuration_names = ['2-1-us', '2-1-world', '6-1-us', '6-1-world']
    #configurations = [["central", "south-central", "west1"],
#                       ["us-central", "eu-north-europe", "asia-japan-east"],
#                       ["central", "west-central", "south-central",
#                        "north-central", "east1", "east2", "west1"],
#                       ["us-central", "us-south-central", "us-west1",
#                        "eu-north-europe", "eu-uk-west", "asia-japan-east",
#                        "asia-japan-west"]]
    if op == 'get':
        thrift_op = "GET_Thrift_Storage_"
        azure_op = "Get_Azure_"
    
    else:
        thrift_op = "Put_Thrift_Storage_"
        azure_op = "Put_Azure_"
    thrift_file_name = '/'.join(['giza_latency_raw', configuration_name, size, op, 'giza_trace.log'])
    thrift_file_map = get_giza_latency_raw_dir_map(thrift_file_name)
    thrift_list = []
    azure_list = []
    for dc in configurations:
        azure_file_name = dc + '_storage_server.log'
        azure_file_name = '/'.join(['giza_latency_raw', configuration_name, size, op, azure_file_name])
        azure_file_map = get_giza_latency_raw_dir_map(azure_file_name)
        if configuration == 0 or configuration == 2:
            azure_list.append(azure_file_map[azure_op + 'us-' +  dc])
        else:
            azure_list.append(azure_file_map[azure_op + dc])
        thrift_list.append(thrift_file_map[thrift_op + dic[dc]])
    return (thrift_list, azure_list)

In [157]:
# Specifically for Data Analysis

# get 2d arrar (matrix graph) of the different dc access. Each element is a list of result that can 
# be summarized
# returns a map where key is a tuple
def get_2d_data(from_dcs, to_dcs, file_format):
    d = {}
    for dc1 in from_dcs:
        for dc2 in to_dcs:
            d[(dc1, dc2)] = get_ping_list(file_format.format(dc1, dc2))
    return d

# reducer of the results returned by get_2d_data
# can be map or vector
def do_2d_analysis(data, data_points, reducer, *args):
    dict
    if type(data) is dict:
        d = {}
        for key in data:
            if len(data[key]) == data_points:
                d[key] = reducer(data[key], *args)
            else:
                d[key] = -1
        return d
    if type(data) is list:
        d = []
        for datum in data:
            d.append(reducer(datum))
        return d

# convert any analysis data into a table
def make_2d_table(from_dcs, to_dcs, data, file_name):
    import numpy as np
    import pandas as pd
    table = np.zeros((len(from_dcs), len(to_dcs)), dtype=np.int)
    for i in range(len(from_dcs)):
        for j in range(len(to_dcs)):
            table[i][j] = data[(from_dcs[i], to_dcs[j])]
    table = pd.DataFrame(table, index=from_dcs, columns=to_dcs)
    table.to_csv(file_name, index=True, header=True, sep=',')

# given data points, calculate different percentiles
def get_percentiles(data, percentiles):
    l = [] 
    for p in percentiles:
        import numpy as np
        l.append(np.percentile(data, p))
    return l

# create bargraph and allows stacking with extra data
# input has to be in the form of [[raw_data1], [raw_data2]. etc]
def bar_graph_with_error(data1, data2, data_names, x_names, y_label, fname, y_range, no_mean = True):
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    f = plt.figure()
    ind = np.arange(len(data1))
    width = 0.25
    if no_mean:
        data1_mean = [np.mean(x)/1000 for x in data1]
        data2_mean = [np.mean(x)/1000 for x in data2]
    
#     data1_err = [np.std(x)/1000 for x in data1]    
#     data2_err = [np.std(x)/1000 for x in data2]
    p1 = plt.bar(ind, data1_mean, width, color='r') #, yerr=data1_err)
    if data2 != '':
        p2 = plt.bar(ind, data2_mean, width, color='y')#, bottom = data1_mean) #, yerr=data2_err)
        plt.legend((p1[0], p2[0]), data_names)
    plt.xticks(ind + width/2., x_names)
    plt.yticks(np.arange(0, y_range, 50))
    plt.ylabel(y_label)
#     plt.legend((p1[0], p2[0]), data_names)
#     plt.show()
    pp = PdfPages(fname + '.pdf')
    pp.savefig(f)
    plt.clf()
    pp.close()

In [114]:
# Get the ping latencies
import numpy as np
dcs = ['central', 'west-central', 'north-central', 'east1', 'east2', 'south-central', 'west1', 'west2', 'north-europe', 'uk-west', 'japan-east', 'japan-west']
file_format = 'ping_latency_raw/{}-to-{}.log'
data = get_2d_data(dcs, dcs, file_format)
percentiles = [10, 50, 75, 95]
for p in percentiles:
    data_summary = do_2d_analysis(data, 1000, np.percentile, p)
    make_2d_table(dcs, dcs, data_summary, 'ping_latency_readable/ping_' + str(p) + '.csv')

ping_latency_raw/west-central-to-north-europe.log
ping_latency_raw/north-europe-to-west-central.log


In [158]:
# Get the tunneling latency
import numpy as np
from_dcs = ['central']
configurations = [0, 2]
sizes = ['256kb', '1mb', '4mb']
ops = ['put', 'get']
cur_dir = 'azure_storage_latency_graph/tunneling'
for configuration in configurations:
    for size in sizes:
        for op in ops:
            config, to_dcs = get_configurations(configuration)
            thrift_latency, azure_latency = get_thrift_storage_lists(configuration, size, op)
#             transfer_latency = []
#             for i in range(len(thrift_latency)):
#                 transfer_latency.append(np.subtract(thrift_latency[i], azure_latency[i]))
            bar_graph_with_error(thrift_latency, azure_latency, ('Transfer', 'Azure'), to_dcs, 'Latency (ms)', '{}/{}-{}-{}'.format(cur_dir, config, op, size), 301)
configurations = [1, 3]
sizes = ['256kb', '1mb', '4mb']
ops = ['put', 'get']
cur_dir = 'azure_storage_latency_graph/tunneling'
for configuration in configurations:
    for size in sizes:
        for op in ops:
            config, to_dcs = get_configurations(configuration)
            thrift_latency, azure_latency = get_thrift_storage_lists(configuration, size, op)
#             transfer_latency = []
#             for i in range(len(thrift_latency)):
#                 transfer_latency.append(np.subtract(thrift_latency[i], azure_latency[i]))
            bar_graph_with_error(thrift_latency, azure_latency, ('Transfer', 'Azure'), to_dcs, 'Latency (ms)', '{}/{}-{}-{}'.format(cur_dir, config, op, size), 401)


In [140]:
size = 128
cur_dir = 'azure_storage_latency_graph/no_tunneling'
to_dc = ['central', 'south-central', 'west1']
config, to_dcs = get_configurations(0)
from_dc = 'central'
storage_latency = []
file_format = 'azure_storage_latency_raw/{}/azure_storage_get_{}kb.txt_{}_to_{}.log'
for dc in to_dc:
    storage_latency.append(get_giza_latency_raw_dir_map(file_format.format(from_dc, size, from_dc, dc))['main'])
bar_graph_with_error(storage_latency, '', '', to_dc, 'Latency (ms)', '{}/{}-{}-{}'.format(cur_dir, config, 'put', '256kb'), 301)

In [147]:
from collections import defaultdict
lf = []
lt = []
with open ('azure_storage_latency_raw/central/blob_ss_h.log', 'r') as f:
    for line in f:
        lf.append(line)
with open ('azure_storage_latency_raw/central/blob_ss_t.log', 'r') as f:
    for line in f:
        lt.append(line)
lf.sort()
lt.sort() 
current_string = ""
max_counter = 0
min_counter = 0
storages = ['blob.by4prdstr03a.store.core.windows.net.http',
           'blob.dm5prdstr02a.store.core.windows.net.http',
           'blob.sn4prdstr03a.store.core.windows.net.http',
           'blob.cy4prdstr01a.store.core.windows.net.http',
           'blob.db4prdstr01a.store.core.windows.net.http']
s_map = {
    'blob.by4prdstr03a.store.core.windows.net.http': 'west1',
    'blob.dm5prdstr02a.store.core.windows.net.http': 'central',
    'blob.sn4prdstr03a.store.core.windows.net.http': 'south-central',
    'blob.cy4prdstr01a.store.core.windows.net.http': 'japan-east',
    'blob.db4prdstr01a.store.core.windows.net.http': 'north-europe'
}
d = defaultdict(list)
ed = defaultdict(list)
for i in range(len(lf)):
    lfs = lf[i].split()
    lfs[4] = lfs[4][:-1]
    lts = lt[i].split()
    before = 0
    after = 0
    if lfs[4] in storages and lts[2] in storages:
        before = datetime.strptime(lfs[0], "%H:%M:%S.%f")
        after = datetime.strptime(lts[0], "%H:%M:%S.%f")
        if current_string == "":
            current_string = lfs[4]
            d[current_string + str(max_counter)].append(after-before)
            ed[s_map[current_string] + str(max_counter)].append(after-before)
        else:
            if current_string != lfs[4]:
                if min_counter == 4:
                    min_counter = 0
                    max_counter += 1
                else:
                    min_counter += 1
                current_string = lfs[4]
                d[current_string + str(max_counter)].append(after-before)
                ed[s_map[current_string] + str(max_counter)].append(after-before)
            else:
                d[current_string + str(max_counter)].append(after-before)
                ed[s_map[current_string] + str(max_counter)].append(after-before)
    else:
        if lfs[4] != lts[2] and lts[2][-1] != 's' and lfs[4][-1] != 's':
            print 'error: ' + lfs[4] + '  ' + lts[2]

In [148]:
# first 3 are put and later 3 are get
for k in ed:
    ed[k] = [x.microseconds for x in ed[k]]
ed.keys()

['north-europe1',
 'north-europe0',
 'north-europe3',
 'north-europe2',
 'north-europe5',
 'north-europe4',
 'central5',
 'west14',
 'central0',
 'central3',
 'central2',
 'west10',
 'central4',
 'west12',
 'west13',
 'central1',
 'west15',
 'south-central3',
 'south-central2',
 'south-central1',
 'south-central0',
 'south-central5',
 'south-central4',
 'japan-east5',
 'japan-east4',
 'japan-east2',
 'japan-east1',
 'japan-east0',
 'japan-east3',
 'west11']

In [164]:
transfer_latency = [np.mean(ed['central3']), np.mean(ed['south-central3']), np.mean(ed['west13'])]
size = 128
cur_dir = 'azure_storage_latency_graph/no_tunneling'
to_dc = ['central', 'south-central', 'west1']
config, to_dcs = get_configurations(0)
from_dc = 'central'
storage_latency = []
file_format = 'azure_storage_latency_raw/{}/azure_storage_get_{}kb.txt_{}_to_{}.log'
for dc in to_dc:
    storage_latency.append(get_giza_latency_raw_dir_map(file_format.format(from_dc, size, from_dc, dc))['main'])

azure_latency = []
for i in range(len(storage_latency)):
    azure_latency.append(np.mean(storage_latency) - transfer_latency[i])

In [165]:
bar_graph_with_error(storage_latency, azure_latency, ('Transfer', 'Azure'), to_dc, 'Latency (ms)', '{}/{}-{}-{}'.format(cur_dir, config, 'put', '256kb'), 301)

In [160]:
west1put = [x.microseconds for x in ed['west10']]
np.mean(west1put)
size = 128
cur_dir = 'azure_storage_latency_graph/no_tunneling'
to_dc = ['central', 'south-central', 'west1']
config, to_dcs = get_configurations(0)
from_dc = 'central'
storage_latency = []
file_format = 'azure_storage_latency_raw/{}/azure_storage_get_{}kb.txt_{}_to_{}.log'
for dc in to_dc:
    storage_latency.append(get_giza_latency_raw_dir_map(file_format.format(from_dc, size, from_dc, dc))['main'])

AttributeError: 'int' object has no attribute 'microseconds'

In [43]:
l.split()

['19:39:02.010536',
 'IP',
 'blob.by4prdstr03a.store.core.windows.net.http',
 '>',
 '10.0.0.5.49750:',
 'Flags',
 '[S.],',
 'seq',
 '909808302,',
 'ack',
 '2022989648,',
 'win',
 '8192,',
 'options',
 '[mss',
 '1440,nop,wscale',
 '8,sackOK,TS',
 'val',
 '253666243',
 'ecr',
 '20875896],',
 'length',
 '0']

In [59]:
print lf[0].split()[4]
print lt[0:10]

blob.by4prdstr03a.store.core.windows.net.http:
['19:39:02.010536 IP blob.by4prdstr03a.store.core.windows.net.http > 10.0.0.5.49750: Flags [S.], seq 909808302, ack 2022989648, win 8192, options [mss 1440,nop,wscale 8,sackOK,TS val 253666243 ecr 20875896], length 0\n', '19:39:02.449507 IP blob.by4prdstr03a.store.core.windows.net.http > 10.0.0.5.49751: Flags [S.], seq 551815351, ack 1238777423, win 8192, options [mss 1440,nop,wscale 8,sackOK,TS val 1786257 ecr 20876005], length 0\n', '19:39:02.888681 IP blob.by4prdstr03a.store.core.windows.net.http > 10.0.0.5.49752: Flags [S.], seq 3044760159, ack 404026492, win 8192, options [mss 1440,nop,wscale 8,sackOK,TS val 255288053 ecr 20876115], length 0\n', '19:39:03.343625 IP blob.by4prdstr03a.store.core.windows.net.http > 10.0.0.5.49753: Flags [S.], seq 60333383, ack 1667997207, win 8192, options [mss 1440,nop,wscale 8,sackOK,TS val 251467366 ecr 20876229], length 0\n', '19:39:03.809915 IP blob.by4prdstr03a.store.core.windows.net.http > 10.0.0.

In [50]:
l = ['19:36', '19:35', '19:37']
l.sort()

In [51]:
l

['19:35', '19:36', '19:37']