In [1]:
import sys
import os
import numpy
print(sys.executable)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import importlib
seaborn_found = importlib.util.find_spec('seaborn')
if seaborn_found is None:
    sys.write.stderr("[error] Seaborn package not found. exit")
    exit(-1)

import seaborn as sns
import pandas as pd

import matplotlib as mpl

#mpl.use("pgf")

import matplotlib.pyplot as plt

plt.rcParams.update({
    "font.family": "serif",  # use serif/main font for text elements
    "text.usetex": False,     # use inline math for ticks
    "pgf.rcfonts": False,    # don't setup fonts from rc parameters
    "pgf.preamble": [
         "\\usepackage{units}",          # load additional packages
         "\\usepackage{metalogo}",
         "\\usepackage{unicode-math}",   # unicode math setup
         r"\setmathfont{xits-math.otf}",
         ]
})

# plt.rc('text', usetex=False)

plt.rcParams.update({
    "font.family": "serif",
    "font.serif": [],                    # use latex default serif font
    "font.size": "18",
})

plt.rcParams.update({
    "pgf.texsystem": "pdflatex",
    "pgf.preamble": [
         r"\usepackage[utf8x]{inputenc}",
         r"\usepackage[T1]{fontenc}",
         r"\usepackage{cmbright}",
         ]
})


FONT=18

sns.set_context("paper", font_scale=1.3, rc={"lines.linewidth": 1.5, "axes.labelsize":FONT, "axes.titlesize":FONT, "legend.fontsize":FONT, "xtick.labelsize":FONT, "ytick.labelsize":FONT, "font.size":FONT})


sns.set_style("darkgrid", {'xtick.bottom': True, 'xtick.top': False, 'ytick.left': True, 'ytick.right': False, 'axes.grid': True, "axes.facecolor": ".9", 'grid.linestyle': '-', 'lines.linewidth':2.5})
# sns.set_palette("dark")

mpl.pyplot.close("all")


tasks = ["STAGEIN_TIME_S", "RESAMPLE_TIME_S", "COMBINE_TIME_S", "MAKESPAN_S"]
DIR="/Users/lpottier/research/usc-isi/projects/active/workflow-io-bb/data/traces/swarp/"

/usr/local/opt/python/bin/python3.7


In [2]:
csv_file_cori = DIR+"/shared-cori/bb_runs2020-multipipeline-1C.csv"
csv_file_summit = DIR+"/"+"private-summit/runs-multi-pipelines-1c-clean.csv"

CORE="1"
# PLOT_DIR="/Users/lpottier/research/usc-isi/projects/active/paper-workflow-bb/figures/plots/real-swarp"
PLOT_DIR=os.getcwd()
OUTPUT = PLOT_DIR+"/"+"swarp-Xw-"+CORE+"c/"
fig_size_square = (5, 4)
fig_size_rect = (6.5, 4)

In [3]:

try:
    # Create target Directory
    os.mkdir(OUTPUT)
    print("Directory " , OUTPUT ,  " Created ") 
except FileExistsError:
    print("Directory " , OUTPUT ,  " already exists")


#csv_file = "swarp_test_switches.csv"
df_cori = pd.read_csv(csv_file_cori, sep=' ')
df_summit = pd.read_csv(csv_file_summit, sep=' ')

df_swarp = pd.concat([df_cori, df_summit])

df_swarp_priv = df_swarp[df_swarp.BB_TYPE=="PRIVATE"]
df_swarp_strip = df_swarp[df_swarp.BB_TYPE=="STRIPED"]
df_swarp_onnode = df_swarp[df_swarp.BB_TYPE=="ONNODE"]


# print(df_swarp.BB_NB_FILES.unique())
# df_swarp = df_swarp.loc[df_swarp['BB_NB_FILES'].isin([0,8,16,24,32])]
# print(df_swarp.BB_NB_FILES.unique())

print(df_swarp.info())
print(df_swarp.NB_CORES.unique())
print(df_swarp.FITS.unique())
print(df_swarp.BB_TYPE.unique())
print(df_swarp.NB_PIPELINE.unique())
## Use the full data

is_FITS =  df_swarp['FITS']=="Y"
is_PRIVATE =  df_swarp['BB_TYPE']=="PRIVATE"
is_STRIPED =  df_swarp['BB_TYPE']=="STRIPED"
is_ONNODE =  df_swarp['BB_TYPE']=="ONNODE"

# print(df_swarp.describe())

agg_swarp = df_swarp.groupby(['ID', 'NB_PIPELINE','FITS','BB_SIZE_FILES_MB', 'BB_TYPE', 'AVG', 'NB_CORES'], as_index=False).agg({'MAKESPAN_S': 'max', 'COMBINE_TIME_S': 'max', 'RESAMPLE_TIME_S': 'max', 'STAGEIN_TIME_S': 'min'})
print(agg_swarp.NB_PIPELINE.unique())

agg_maxmin = df_swarp.groupby(['ID', 'NB_PIPELINE','FITS','BB_SIZE_FILES_MB', 'BB_TYPE', 'AVG', 'NB_CORES'], as_index=False)['MAKESPAN_S','COMBINE_TIME_S','RESAMPLE_TIME_S','STAGEIN_TIME_S'].agg(numpy.ptp)
#agg_maxmin = df_swarp.groupby(['ID', 'NB_PIPELINE','FITS', 'BB_TYPE', 'AVG', 'NB_CORES'], as_index=False)['MAKESPAN_S'].apply(lambda g: g.min())

#print(df_swarp.BB_SIZE_FILES_MB)


Directory  /Users/lpottier/research/usc-isi/projects/active/workflow-io-bb/data/traces/notebooks/swarp-Xw-1c/  already exists
<class 'pandas.core.frame.DataFrame'>
Int64Index: 225 entries, 0 to 74
Data columns (total 25 columns):
AVG                    225 non-null int64
BANDWIDTH_MBS          75 non-null float64
BB_ALLOC_SIZE_MB       225 non-null float64
BB_NB_FILES            225 non-null int64
BB_SIZE_FILES_MB       225 non-null float64
BB_TYPE                225 non-null object
COMBINE_TIME_S         225 non-null float64
COMBINE_WALLTIME_S     225 non-null float64
END                    0 non-null float64
FITS                   225 non-null object
ID                     225 non-null int64
MAKESPAN_S             225 non-null float64
NB_CORES               225 non-null int64
NB_PIPELINE            225 non-null int64
PIPELINE               225 non-null int64
RESAMPLE_TIME_S        225 non-null float64
RESAMPLE_WALLTIME_S    225 non-null float64
STAGEIN_TIME_S         225 non-null flo

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  del sys.path[0]
  return ptp(axis=axis, out=out, **kwargs)


In [4]:
xlabel='Number of SWARP pipelines'
ylabel='Execution time (s)'
legend_label=['Private (Cori)', 'Striped (Cori)', 'On-node (Summit)']

In [5]:

def plot_lineplot_per_task(df, col, output, loc='upper left'):
    
    xlabel='Number of SWARP pipelines'
    ylabel='Execution time (s)'
#     legend_label=['Private (Cori)', 'Striped (Cori)', 'On-node (Summit)']
    
    name = col.split("_")[0].lower()
    
    df2 = df.copy()
    
    df2["BB_TYPE"]= df2["BB_TYPE"].str.replace("PRIVATE", 'Private (Cori)', case = False)
    df2["BB_TYPE"]= df2["BB_TYPE"].str.replace("STRIPED", 'Striped (Cori)', case = False)
    df2["BB_TYPE"]= df2["BB_TYPE"].str.replace("ONNODE", 'On-node (Summit)', case = False)
#     df2.rename(columns={'BB_TYPE':'Burst Buffers design'}, inplace=True)

    f = plt.figure(figsize=fig_size_rect)
    f.patch.set_alpha(0)
    
    g = sns.lineplot(x="NB_PIPELINE", y=col, hue="BB_TYPE", style="BB_TYPE", data=df2, markers=True, palette={'Private (Cori)': 'b', 'Striped (Cori)': 'r', 'On-node (Summit)': 'g'})
    handles, labels = g.get_legend_handles_labels()
    g.legend(title='', ncol=1, frameon=False, handles=handles[1:], labels=labels[1:])
    
    g.set(xlabel=xlabel, ylabel=ylabel, xticks=df.NB_PIPELINE.unique())

    plt.tight_layout()
    plt.savefig(OUTPUT+'swarp-line-'+name+output+'.pdf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.savefig(OUTPUT+'swarp-line-'+name+output+'.pgf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.show()

loc=['upper left', 'lower right', 'upper left', 'lower right']

for i,task in enumerate(tasks):
    print("Plot line fits task:", task)
    plot_lineplot_per_task(agg_swarp, task, output="-fits-1w-Xc-32f", loc=loc[i])
    

Plot line fits task: STAGEIN_TIME_S




Plot line fits task: RESAMPLE_TIME_S




Plot line fits task: COMBINE_TIME_S




Plot line fits task: MAKESPAN_S




In [27]:
def plot_boxplot_allmode_task(df, col, output):
    #df = df[df['STAGEIN_TIME_S'] < ]
    xlabel='Number of SWARP pipelines'
    ylabel='Execution time (s)'
    
    df2 = df.copy()
    
    df2["BB_TYPE"]= df2["BB_TYPE"].str.replace("PRIVATE", 'Private (Cori)', case = False)
    df2["BB_TYPE"]= df2["BB_TYPE"].str.replace("STRIPED", 'Striped (Cori)', case = False)
    df2["BB_TYPE"]= df2["BB_TYPE"].str.replace("ONNODE", 'On-node (Summit)', case = False)
#     df2.rename(columns={'BB_TYPE':'Burst Buffers design'}, inplace=True)


    name = col.split("_")[0].lower()
    
    f = plt.figure(figsize=(8,5.5))
    f.patch.set_alpha(0)
    
    g = sns.boxplot(x="NB_PIPELINE", y="RESAMPLE_TIME_S", hue="BB_TYPE", data=df2, dodge=True, showfliers=False)
#     g.add_legend(frameon=False, bbox_to_anchor=(1.05, 0))
#     handles, labels = g.get_legend_handles_labels()
    g.legend(title='', ncol=2, frameon=False)
    
    g.set(xlabel=xlabel, ylabel=ylabel)

    plt.tight_layout()
    plt.savefig(OUTPUT+'swarp-box-allmode-'+name+output+'.pdf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.savefig(OUTPUT+'swarp-box-allmode-'+name+output+'.pgf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.show()

    
for i,task in enumerate(tasks):
    print("Plot box all mode:", task)
    plot_boxplot_allmode_task(df=df_swarp[is_FITS], col=task, output="-fits-1w-Xc-32f" )
    

Plot box all mode: STAGEIN_TIME_S


  app.launch_new_instance()
  app.launch_new_instance()


Plot box all mode: RESAMPLE_TIME_S


  app.launch_new_instance()


Plot box all mode: COMBINE_TIME_S


  app.launch_new_instance()


Plot box all mode: MAKESPAN_S




In [7]:

xlabel='Input files in burst buffers (%)'
legend_label=['Private', 'Striped']

def plot_boxplot_all_tasks(df, output, print_dots=True):

    with sns.light_palette("green"):

        #print(test.info())

        f, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True)
        f.patch.set_alpha(0)

        g1 = sns.boxplot(x="NB_PIPELINE", y="STAGEIN_TIME_S", data=df, ax=axes[0,0])
        if print_dots:
            g1 = sns.swarmplot(x="NB_PIPELINE", y="STAGEIN_TIME_S", data=df, color=".25", ax=axes[0,0])
        g1.set(xlabel='', ylabel='Stage in time (s)')

        g2 = sns.boxplot(x="NB_PIPELINE", y="RESAMPLE_TIME_S", data=df, ax=axes[0,1])
        if print_dots:
            g2 = sns.swarmplot(x="NB_PIPELINE", y="RESAMPLE_TIME_S", data=df, color=".25", ax=axes[0,1])
        g2.set(xlabel='', ylabel='Resample time (s)')

        g3 = sns.boxplot(x="NB_PIPELINE", y="COMBINE_TIME_S", data=df, ax=axes[1,0])
        if print_dots:
            g3 = sns.swarmplot(x="NB_PIPELINE", y="COMBINE_TIME_S", data=df, color=".25", ax=axes[1,0])
        g3.set(xlabel=xlabel, ylabel='Combine time (s)')

        g4 = sns.boxplot(x="NB_PIPELINE", y="MAKESPAN_S", data=df, ax=axes[1,1])
        if print_dots:
            g4 = sns.swarmplot(x="NB_PIPELINE", y="MAKESPAN_S", data=df, color=".25", ax=axes[1,1])
        g4.set(xlabel=xlabel, ylabel='Makespan (s)')

        plt.tight_layout()
#       plt.savefig(OUTPUT+output+'.pdf')
#       plt.savefig(OUTPUT+output+'.pgf')
        plt.show() 


plot_boxplot_all_tasks(agg_swarp[agg_swarp.BB_TYPE=="PRIVATE"], output="swarp-box-fits-private-1w-32c")




In [8]:
plot_boxplot_all_tasks(agg_swarp[agg_swarp.BB_TYPE=="STRIPED"], output="swarp-box-fits-striped-1w-32c")



In [9]:
def plot_boxplot_per_task(df, col, output, print_dots=True):
    #df = df[df['STAGEIN_TIME_S'] < ]
    xlabel='Number of SWARP pipelines'
    ylabel='Execution time (s)'
    legend_label=['Private', 'Striped']
    
    name = col.split("_")[0].lower()
    
    f = plt.figure(figsize=fig_size_rect)
    f.patch.set_alpha(0)
    
    with sns.light_palette("green"):
        g1 = sns.boxplot(x="NB_PIPELINE", y=col, data=df)
        if print_dots:
            g1 = sns.swarmplot(x="NB_PIPELINE", y=col, data=df, color=".25")
        g1.set(xlabel=xlabel, ylabel=ylabel)

        plt.tight_layout()
        plt.savefig(OUTPUT+'swarp-box-'+name+output+'.pdf', facecolor=f.get_facecolor(), edgecolor='none')
        plt.savefig(OUTPUT+'swarp-box-'+name+output+'.pgf', facecolor=f.get_facecolor(), edgecolor='none')
        plt.show()

loc=['upper left', 'lower right', 'upper left', 'lower right']

# for i,task in enumerate(tasks):
#     print("Plot line fits task:", task)
plot_boxplot_per_task(agg_swarp[agg_swarp.BB_TYPE=="PRIVATE"], "RESAMPLE_TIME_S", output="-fits-private-1w-Xc-32f")
plot_boxplot_per_task(agg_swarp[agg_swarp.BB_TYPE=="STRIPED"], "RESAMPLE_TIME_S", output="-fits-striped-1w-Xc-32f")
plot_boxplot_per_task(agg_swarp[agg_swarp.BB_TYPE=="ONNODE"], "RESAMPLE_TIME_S", output="-fits-onnode-1w-Xc-32f")





In [10]:
print_dots = True
xlabel='Number of SWARP pipelines'
ylabel='Resample execution time (s)'

with sns.light_palette("green"):
    f, axes = plt.subplots(1, 2, figsize=(9, 4.5), sharex=True, sharey=True)
    f.patch.set_alpha(0)
    
    g1 = sns.boxplot(x="NB_PIPELINE", y="RESAMPLE_TIME_S", data=agg_swarp[agg_swarp.BB_TYPE=="PRIVATE"], ax=axes[0])
    if print_dots:
        g1 = sns.swarmplot(x="NB_PIPELINE", y="RESAMPLE_TIME_S", data=agg_swarp[agg_swarp.BB_TYPE=="PRIVATE"], color=".25", ax=axes[0])
    g1.set(xlabel=xlabel+' with a private allocation', ylabel=ylabel)
        
with sns.light_palette("red"):
    g2 = sns.boxplot(x="NB_PIPELINE", y="RESAMPLE_TIME_S", data=agg_swarp[agg_swarp.BB_TYPE=="STRIPED"], ax=axes[1])
    if print_dots:
        g2 = sns.swarmplot(x="NB_PIPELINE", y="RESAMPLE_TIME_S", data=agg_swarp[agg_swarp.BB_TYPE=="STRIPED"], color=".25", ax=axes[1])
    g2.set(xlabel=xlabel+' with a striped allocation', ylabel='')

    plt.tight_layout()
    plt.savefig(OUTPUT+'swarp-box-resample-1w-Xc-32f.pdf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.savefig(OUTPUT+'swarp-box-resample-1w-Xc-32f.pgf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.show()



In [11]:
print_dots = True
xlabel='Number of SWARP pipelines'
ylabel='Bandwidth (MB/s)'
legend_label=['Private', 'Striped']

agg_swarp['bw'] = agg_swarp['BB_SIZE_FILES_MB'] / agg_swarp['STAGEIN_TIME_S']

# bandwidth_swarp = df_swarp.groupby(['ID', 'NB_PIPELINE','FITS', 'BB_TYPE', 'AVG', 'NB_CORES'], as_index=False).agg(lambda g: g['BB_SIZE_FILES_MB'].max()/g['STAGEIN_TIME_S'].max())

#print(bandwidth_swarp)

# f, axes = plt.subplots(1, 2, figsize=(9, 4.5), sharex=True, sharey=True)
f = plt.figure(figsize=fig_size_rect)
f.patch.set_alpha(0)

g1 = sns.lineplot(x="NB_PIPELINE", y="bw", data=agg_swarp, hue="BB_TYPE", style="BB_TYPE", markers=True)
g1.set(xlabel=xlabel, ylabel=ylabel)

# g1.set_xscale('log')
# g1.set_xticks(agg_swarp.NB_PIPELINE.unique())
# g1.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())

plt.tight_layout()
plt.savefig(OUTPUT+'swarp-bandwidth-1w-Xc-32f.pdf', facecolor=f.get_facecolor(), edgecolor='none')
plt.savefig(OUTPUT+'swarp-bandwidth-1w-Xc-32f.pgf', facecolor=f.get_facecolor(), edgecolor='none')
plt.show()



In [12]:
is_One_pipe =  df_swarp['NB_PIPELINE']=="1"
is_32pipe =  df_swarp['NB_PIPELINE']=="32"

df_by_pipeline = df_swarp[is_FITS & is_PRIVATE].groupby('NB_PIPELINE').aggregate(['mean', 'max','min'])
df_by_pipeline.head()

print("PRIVATE: max slowdown makespan : ", max(df_by_pipeline.MAKESPAN_S['mean'])/min(df_by_pipeline.MAKESPAN_S['mean']))
print("PRIVATE: max slowdown resample : ", max(df_by_pipeline.RESAMPLE_TIME_S['mean'])/min(df_by_pipeline.RESAMPLE_TIME_S['mean']))
print("PRIVATE: max slowdown stagein  : ", max(df_by_pipeline.STAGEIN_TIME_S['mean'])/min(df_by_pipeline.STAGEIN_TIME_S['mean']))
print("PRIVATE: max slowdown combine  : ", max(df_by_pipeline.COMBINE_TIME_S['mean'])/min(df_by_pipeline.COMBINE_TIME_S['mean']))

PRIVATE: max slowdown makespan :  1.6474340784256343
PRIVATE: max slowdown resample :  1.288201781467465
PRIVATE: max slowdown stagein  :  1.5296404830989374
PRIVATE: max slowdown combine  :  3.4473742365477067


  result = method(y)


In [13]:
is_One_pipe =  df_swarp['NB_PIPELINE']=="1"
is_32pipe =  df_swarp['NB_PIPELINE']=="32"

df_by_pipeline = df_swarp[is_FITS & is_STRIPED].groupby('NB_PIPELINE').aggregate(['mean', 'max','min'])
df_by_pipeline.head()

print("STRIPED: max slowdown makespan : ", max(df_by_pipeline.MAKESPAN_S['mean'])/min(df_by_pipeline.MAKESPAN_S['mean']))
print("STRIPED: max slowdown resample : ", max(df_by_pipeline.RESAMPLE_TIME_S['mean'])/min(df_by_pipeline.RESAMPLE_TIME_S['mean']))
print("STRIPED: max slowdown stagein  : ", max(df_by_pipeline.STAGEIN_TIME_S['mean'])/min(df_by_pipeline.STAGEIN_TIME_S['mean']))
print("STRIPED: max slowdown combine  : ", max(df_by_pipeline.COMBINE_TIME_S['mean'])/min(df_by_pipeline.COMBINE_TIME_S['mean']))

STRIPED: max slowdown makespan :  1.5344359303775905
STRIPED: max slowdown resample :  1.2586218755509058
STRIPED: max slowdown stagein  :  1.0960218444041618
STRIPED: max slowdown combine  :  2.266869294657646


In [14]:
is_One_pipe =  df_swarp['NB_PIPELINE']=="1"
is_32pipe =  df_swarp['NB_PIPELINE']=="32"

df_by_pipeline = df_swarp[is_FITS & is_ONNODE].groupby('NB_PIPELINE').aggregate(['mean', 'max','min'])
df_by_pipeline.head()

print("ONNODE: max slowdown makespan : ", max(df_by_pipeline.MAKESPAN_S['mean'])/min(df_by_pipeline.MAKESPAN_S['mean']))
print("ONNODE: max slowdown resample : ", max(df_by_pipeline.RESAMPLE_TIME_S['mean'])/min(df_by_pipeline.RESAMPLE_TIME_S['mean']))
print("ONNODE: max slowdown stagein  : ", max(df_by_pipeline.STAGEIN_TIME_S['mean'])/min(df_by_pipeline.STAGEIN_TIME_S['mean']))
print("ONNODE: max slowdown combine  : ", max(df_by_pipeline.COMBINE_TIME_S['mean'])/min(df_by_pipeline.COMBINE_TIME_S['mean']))

ONNODE: max slowdown makespan :  1.2058043506661011
ONNODE: max slowdown resample :  1.0057196416969332
ONNODE: max slowdown stagein  :  1.108533884504612
ONNODE: max slowdown combine  :  4.380669891393629
