In [45]:
from xetrack import Reader
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "iframe"


# Let pandas display wide/long frames without truncation in this notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Decimal megabyte. Assumes `file_bytes` holds a size in bytes — TODO confirm.
BYTES_PER_MB = 1_000_000

df = Reader('../output/stats.db').to_df()
# Keep only the last experiment: rows sharing the track_id of the final row.
df = df[df['track_id'] == df['track_id'].iloc[-1]]
# Drop the techs excluded from the comparison and rows with a negative step.
# .copy() yields an independent frame, so the column assignments below do not
# write into a view of the unfiltered data (no SettingWithCopyWarning).
df = df[~df['tech'].isin(['shutil', 'm1']) & (df['step'] >= 0)].copy()
# Literal (non-regex) replacements: behaviour no longer depends on the pandas
# version's default for `regex`.
df['name'] = (
    df['name']
    .str.replace('_upload', '', regex=False)
    .str.replace('_', ' ', regex=False)
)
# df['function'] = df['function'].str.replace('_',' ')
# df['function'] = df['function'].str.replace(' new upload','').str.replace(' merged upload','')
# Throughput in megabytes per second (the previous value was bytes per second
# although the column and the chart are labelled MB/s).
df['mb/s'] = df['file_bytes'] / BYTES_PER_MB / df['time']
df.to_csv('../output/results.csv', index=False)
# Report every distinct non-empty error, including the case where all rows
# carry the same error message.
errors = set(df['error'].dropna()) - {''}
if errors:
    print(f"Errors: {errors}")
print(f"Steps: {df['step'].max()+1}")
print(f"Data size: {len(df)}")
print("\nTime per tech - lower is better")
px.bar(df, x='name', y='time', color='tech').show()

Steps: 4
Data size: 23

Time per tech - lower is better


In [46]:
# Bar chart of the 'mb/s' column per name, coloured by tech.
print("MB per Second - higher is better")
throughput_fig = px.bar(df, x='name', y='mb/s', color='tech')
throughput_fig.show()

MB per Second - higher is better


In [31]:
pio.renderers.default='iframe'
groups = df.groupby('name')
# Sum numeric columns only: summing string columns concatenates them or raises
# on recent pandas versions, and only 'time' is needed for the pie.
sums = groups.sum(numeric_only=True)
# fig1: share of total time per name.
fig1 = px.pie(sums, values='time', names=sums.index)
fig1.update_traces(textposition='inside', textinfo='percent+label')

# fig2: one line per name showing time at each step.
fig2 = go.Figure()
for name, group in groups:
    fig2.add_trace(go.Scatter(x=group['step'], y=group['time'], mode='lines', name=str(name)))
fig2.update_layout(xaxis_title='Step', yaxis_title='Time')

# fig3: running total of time within each name.
# The cumsum follows row order; assumes rows are already in step order — TODO confirm.
df['cumulative_time'] = df.groupby('name')['time'].cumsum()
# Colour by the same key the cumsum was grouped on; colouring by 'tech' merged
# every name of a tech into a single zig-zagging line.
fig3 = px.line(df, x='step', y='cumulative_time', color='name')
fig3.update_layout(title='Cumulative Time per Name over Steps',
                  xaxis_title='Steps',
                  yaxis_title='Cumulative Time')

# fig4: running total of time within each tech.
df['tech_cumulative_time'] = df.groupby('tech')['time'].cumsum()

fig4 = px.line(df, x='step', y='tech_cumulative_time', color='tech')
# Distinct title so fig3 and fig4 can be told apart when rendered.
fig4.update_layout(title='Cumulative Time per Tech over Steps',
                  xaxis_title='Steps',
                  yaxis_title='Cumulative Time')

fig1.show()

In [32]:
# Render fig3: the line chart of 'cumulative_time' (cumsum of time grouped by name).
fig3.show()

In [33]:
# Render fig2: one line trace per name, plotting time against step.
fig2.show()

In [34]:
# Render fig4: the line chart of 'tech_cumulative_time' (cumsum of time grouped by tech).
fig4.show()

In [23]:
# Total time per step, computed in one pass. groupby sorts its keys, so the
# report is always in ascending step order (iterating a set gives no ordering
# guarantee and rescanned the frame once per step).
step_seconds = df.groupby('step')['time'].sum()
for step, seconds in step_seconds.items():
    # 'time' is treated as seconds here: /60 -> minutes.
    print(f"Step {step} took {seconds/60:.2f} minutes")
# Overall total, converted from seconds to hours.
print(f"total time is: {df['time'].sum()/(60*60):.2f} hours")

Step 0 took 1.98 minutes
Step 1 took 2.18 minutes
Step 2 took 1.91 minutes
Step 3 took 1.89 minutes
Step 4 took 2.24 minutes
Step 5 took 2.19 minutes
Step 6 took 2.42 minutes
Step 7 took 2.42 minutes
Step 8 took 2.46 minutes
Step 9 took 2.26 minutes
Step 10 took 1.91 minutes
Step 11 took 2.04 minutes
Step 12 took 2.19 minutes
Step 13 took 2.16 minutes
Step 14 took 1.54 minutes
Step 15 took 1.87 minutes
Step 16 took 2.10 minutes
Step 17 took 2.53 minutes
Step 18 took 2.19 minutes
Step 19 took 1.89 minutes
Step 20 took 2.15 minutes
Step 21 took 0.63 minutes
total time is: 0.75 hours


In [24]:
# Mean time per 'function' value.
times = df.groupby('function')['time'].mean()
# Each mean expressed as a multiple of the smallest mean (fastest == 1.0).
ratios = times.div(times.min())

print("Average time:")
# Pie of the mean times, labelled by function.
fig5 = px.pie(times, values='time', names=times.index)
fig5.update_traces(textposition='inside', textinfo='percent+label')
fig5.show()

# Last expression of the cell: the ratios, displayed in ascending order.
ratios.sort_values()

Average time:


function
lfs s3 upload     1.000000
lfs git upload    1.114282
lakefs upload     1.200398
s3 new upload     1.293673
dvc upload        2.093847
pyxet upload      3.022268
git-xet upload    4.142139
Name: time, dtype: float64