In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
import os
import shutil

%matplotlib

Using matplotlib backend: Qt5Agg


In [2]:
# Number of files staged per task in each experiment. Each value selects the
# matching '../raw_data/data-<value>/' directory when the results are loaded.
# For a quick single-configuration run, use [100] instead.
files = [
    10,
    100,
    1000,
    10000,
]

In [3]:
# Helper functions: total time covered by a set of (possibly overlapping) ranges

def get_Toverlap(d, start_state, stop_state):
    '''
    Return the total length covered by the [start, stop] ranges of all
    entries of `d`, counting overlapping stretches only once.

    Arguments:
        d           -- mapping (e.g. a dict; presumably also a DataFrame, as
                       the original docstring suggests -- TODO confirm) whose
                       values can be indexed by `start_state` and `stop_state`.
        start_state -- key of the lower boundary of each entry's range.
        stop_state  -- key of the upper boundary of each entry's range.

    Returns:
        The sum of the lengths of the collapsed (merged) ranges, or 0 when
        `d` has no entries.
    '''

    # `.items()` is available on dicts under both Python 2 and Python 3 and on
    # pandas objects; `.iteritems()` is Python 2 only for dicts.
    ranges = [[states[start_state], states[stop_state]]
              for _, states in d.items()]

    # nothing to merge: the covered length is zero
    if not ranges:
        return 0

    # merge overlapping ranges first so that shared time is not counted twice
    return sum(crange[1] - crange[0] for crange in collapse_ranges(ranges))

def collapse_ranges(ranges):
    """
    Collapse a collection of ranges into the smallest possible list of ranges
    which cover the same, but not more nor less, of the domain.

    Arguments:
        ranges -- iterable of ranges, each a pair-like sequence [start, end]
                  with 'start <= end'. The input is not modified; tuples are
                  accepted as well as lists.

    Returns:
        A list of [start, end] lists, sorted by start, in which no two ranges
        overlap or touch. An empty input yields an empty list.

    Algorithm:

    We first sort the ranges by their starting point. We then start with the
    range with the smallest starting point [start_1, end_1], and compare to the
    next following range [start_2, end_2], where we now know that start_1 <=
    start_2. We have now two cases:

    a) when start_2 <= end_1, then the ranges overlap, and we collapse them
    into range_1: range_1 = [start_1, max(end_1, end_2)]

    b) when start_2 > end_1, then ranges don't overlap. Importantly, none of
    the other later ranges can ever overlap range_1. So we move range_1 to
    the set of final ranges, and restart the algorithm with range_2 being
    the smallest one.

    Termination condition is if only one range is left -- it is also moved to
    the list of final ranges then, and that list is returned.
    """

    START = 0
    END = 1

    # Sort copies of the ranges: the merge below extends ranges in place, and
    # working on copies keeps the caller's data untouched (and allows tuples).
    _ranges = sorted((list(r) for r in ranges), key=lambda x: x[START])

    # no ranges at all -- nothing to collapse
    if not _ranges:
        return []

    final = []
    base = _ranges[0]  # range with the smallest start

    for _range in _ranges[1:]:

        if _range[START] <= base[END]:
            # ranges overlap -- extend the base
            base[END] = max(base[END], _range[END])

        else:
            # ranges don't overlap -- move base to final, and current _range
            # becomes the new base
            final.append(base)
            base = _range

    # termination: push last base to final
    final.append(base)

    return final


In [4]:
# EnTK duration

def entk_duration(fname):
    """
    Compute the total time during which at least one task was between its
    'submitted' and 'done' events, as recorded in the profile `fname`.

    The file is read as comma-separated values; its first 9 lines are skipped
    and the remaining rows are given the column names listed below. Only the
    'time', 'task' and 'msg' columns are used. The 'task' field is expected to
    have the form '<label>: <integer>'.

    For every task, the range runs from its first 'submitted' row to its first
    'done' row. If a task has no 'submitted' row of its own, the most recent
    submission of any task is used as its start.

    Arguments:
        fname -- path of the profile to read.

    Returns:
        The summed length of the merged [start, stop] ranges of all tasks
        that reached 'done', or 0 if no task did.

    Raises:
        ValueError -- if a 'done' row appears before any 'submitted' row.
    """

    columns = ['time', 'name', 'uid', 'state', 'event',
               'iteration', 'stage', 'task', 'msg']
    df = pd.read_csv(fname, sep=',', skiprows=9, index_col=False,
                     names=columns)

    submitted = dict()      # task id -> time of its first 'submitted' row
    last_submitted = None   # time of the latest 'submitted' row of any task
    super_dict = dict()     # task id -> {'start': ..., 'stop': ...}

    for time, task, msg in zip(df['time'], df['task'], df['msg']):
        time = float(time)
        task = int(task.split(':')[1].strip())
        msg = msg.strip()

        if msg == 'submitted':
            # remember the start per task: a single shared variable would be
            # overwritten by every later submission of any other task
            submitted.setdefault(task, time)
            last_submitted = time

        elif msg == 'done' and task not in super_dict:
            # only the first 'done' of a task is recorded
            start_time = submitted.get(task, last_submitted)
            if start_time is None:
                raise ValueError(
                    "task {0} is 'done' before any task was 'submitted' "
                    "in {1}".format(task, fname))
            super_dict[task] = {'start': start_time, 'stop': time}

    # no completed task: there is no range to measure
    if not super_dict:
        return 0

    return get_Toverlap(super_dict, 'start', 'stop')

In [7]:
def plot(df):
    """
    Draw `df` as a line plot with the title and axis labels used for the
    'total time to execution vs. number of staged files' figure.

    Arguments:
        df -- object whose `plot()` method returns the axes to label
              (a pandas DataFrame in this notebook).

    Returns:
        None; the figure is produced as a side effect.
    """

    label_size = 18
    heading = ('Total time to execution as a function of the number of files \n'
               'staged in each task (number of tasks = 16)')

    axes = df.plot(kind='line', fontsize=label_size, title=heading)

    axes.set_xlabel('Number of files staged in each task', fontsize=label_size)
    axes.set_ylabel('Total time to execution', fontsize=label_size)

    # re-apply the title so that it is rendered at the same size as the labels
    axes.set_title(axes.get_title(), fontsize=label_size)
    
    

In [8]:
# Compute the total time to execution (TTE) for every configuration in
# `files` and plot the result.
ttes = []

for f in files:

    pattern = '../raw_data/data-{0}/entk.app*.prof'.format(f)

    # sort so that the choice is deterministic if several profiles match
    matches = sorted(glob(pattern))
    if not matches:
        raise IOError('no EnTK profile matches {0!r}'.format(pattern))

    # The profile is read in place: copying it to a temporary .csv first is
    # not needed for parsing and leaves the copy behind if parsing fails.
    fname = matches[0]
    tte = entk_duration(fname)
    ttes.append(tte)

# Build the frame once from the collected values, so that the 'TTE' column
# gets its dtype from the data instead of from an empty placeholder frame.
df = pd.DataFrame({'TTE': ttes}, index=files)

plot(df)