In [76]:
from xetrack import Reader
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# import plotly.io as pio
# pio.renderers.default='notebook'
WORKFLOW = 'append' # random, taxi
# Decimal megabyte, used to turn a byte count into MB for the throughput column.
BYTES_PER_MB = 1_000_000

# Load every tracked row from the stats database into a DataFrame.
df = Reader('../output/stats.db').to_df()

# Keep only the last experiment (the track_id of the final row), drop the
# 'shutil' and 'm1' techs, and discard rows with a negative step.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
last_track_id = df['track_id'].iloc[-1]
df = df[
    (df['track_id'] == last_track_id)
    & ~df['tech'].isin(['shutil', 'm1'])
    & (df['step'] >= 0)
].copy()

# Human-readable labels: underscores become spaces, and the upload-variant
# suffixes are stripped so both variants share one function label.
# regex=False makes these literal replacements on every pandas version.
df['name'] = df['name'].str.replace('_', ' ', regex=False)
df['function'] = (
    df['function']
    .str.replace('_', ' ', regex=False)
    .str.replace(' new upload', '', regex=False)
    .str.replace(' merged upload', '', regex=False)
)

# Throughput in megabytes per unit of `time`.
# NOTE(review): assumes `file_bytes` is a byte count, as the column name
# suggests; previously the raw bytes/time value was stored under 'mb/s'.
df['mb/s'] = df['file_bytes'] / BYTES_PER_MB / df['time']
df.to_csv('../output/results.csv', index=False)

# Report the distinct non-empty error values. Missing values and empty strings
# mean "no error", so they are excluded; a single repeated error is reported too.
errors = set(df['error'].dropna()) - {''}
if errors:
    print(f"Errors: {errors}")
print(f"Steps: {df['step'].max()+1}")
print(f"Data size: {len(df)}")
px.bar(df, x='function', y='mb/s', color='tech').show()

Steps: 33
Data size: 230


In [69]:
# Order rows by step once, so the cumulative sums and line traces below follow
# the step axis regardless of the row order returned by the database.
# A stable sort keeps the original relative order of rows sharing a step.
# `df` is rebound to the sorted frame; the columns added below live on it.
df = df.sort_values('step', kind='stable')
groups = df.groupby('function')

# fig1: share of the total time spent in each function.
# numeric_only=True restricts the sum to numeric columns; without it, recent
# pandas versions also try to "sum" string columns such as `name` and `tech`.
sums = groups.sum(numeric_only=True)
fig1 = px.pie(sums, values='time', names=sums.index)
fig1.update_traces(textposition='inside', textinfo='percent+label')

# fig2: raw time per step, one line per function.
fig2 = go.Figure()
for name, group in groups:
    fig2.add_trace(go.Scatter(x=group['step'], y=group['time'], mode='lines', name=str(name)))
fig2.update_layout(xaxis_title='Step', yaxis_title='Time')

# fig3: running total of time within each function, in step order.
df['cumulative_time'] = df.groupby('function')['time'].cumsum()
fig3 = px.line(df, x='step', y='cumulative_time', color='function')
fig3.update_layout(title='Cumulative Time per Step by Function',
                   xaxis_title='Steps',
                   yaxis_title='Cumulative Time')

# fig4: running total of time within each tech, in step order.
df['tech_cumulative_time'] = df.groupby('tech')['time'].cumsum()
fig4 = px.line(df, x='step', y='tech_cumulative_time', color='tech')
fig4.update_layout(title='Cumulative Time per Step by Tech',
                   xaxis_title='Steps',
                   yaxis_title='Cumulative Time')

fig1.show()
fig3.show()
fig2.show()
fig4.show()

In [70]:
# Total time per step, printed in minutes (time / 60). A single groupby yields
# the steps in ascending order and avoids re-filtering the frame for every
# step; iterating a set gives no ordering guarantee.
step_totals = df.groupby('step')['time'].sum()
for step, step_total in step_totals.items():
    print(f"Step {step} took {step_total/60:.2f} minutes")
# Grand total over all rows, printed in hours (time / 3600).
print(f"total time is: {df['time'].sum()/(60*60):.2f} hours")

Step 0 took 3.41 minutes
Step 1 took 2.05 minutes
Step 2 took 2.88 minutes
Step 3 took 2.21 minutes
Step 4 took 2.88 minutes
Step 5 took 2.48 minutes
Step 6 took 2.89 minutes
Step 7 took 2.92 minutes
Step 8 took 2.55 minutes
Step 9 took 2.43 minutes
Step 10 took 2.72 minutes
Step 11 took 3.54 minutes
Step 12 took 5.07 minutes
Step 13 took 2.59 minutes
Step 14 took 2.66 minutes
Step 15 took 2.98 minutes
Step 16 took 2.20 minutes
Step 17 took 2.92 minutes
Step 18 took 6.44 minutes
Step 19 took 2.61 minutes
Step 20 took 6.15 minutes
Step 21 took 5.69 minutes
Step 22 took 2.78 minutes
Step 23 took 2.68 minutes
Step 24 took 2.45 minutes
Step 25 took 3.95 minutes
Step 26 took 7.19 minutes
Step 27 took 3.41 minutes
Step 28 took 2.93 minutes
Step 29 took 4.30 minutes
Step 30 took 4.55 minutes
Step 31 took 2.50 minutes
Step 32 took 2.42 minutes
total time is: 1.86 hours


In [71]:
# Mean `time` of each function over all rows of `df`.
times = df.groupby('function')['time'].agg('mean')
# Each mean divided by the smallest mean, so the smallest one maps to 1.0.
ratios = times.div(times.min())
print("Average time:")
# Pie of the mean times; update_traces returns the same figure, so it chains.
fig5 = px.pie(times, values='time', names=times.index).update_traces(
    textposition='inside',
    textinfo='percent+label',
)
fig5.show()
# Last expression of the cell: the ratios displayed in ascending order.
ratios.sort_values()

Average time:


function
lakefs     1.000000
pyxet      1.068853
s3         1.312913
lfs git    1.433272
git-xet    1.613705
lfs s3     1.854389
dvc        2.108858
Name: time, dtype: float64