In [122]:
from xetrack import Reader
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# import plotly.io as pio
# pio.renderers.default='notebook'
WORKFLOW = 'append' # random, taxi
BYTES_PER_MB = 1_000_000  # decimal megabyte, used to convert bytes/s into MB/s

df = Reader('../output/stats.db').to_df()

# Keep only the most recent experiment (the track_id of the last row), drop the
# 'shutil' and 'm1' techs, and discard rows with a negative step.
# A single combined mask followed by .copy() yields an independent frame, so the
# column assignments below do not write into a view of the raw table.
last_track_id = df['track_id'].iloc[-1]
keep = (
    (df['track_id'] == last_track_id)
    & (df['tech'] != 'shutil')
    & (df['tech'] != 'm1')
    & (df['step'] >= 0)
)
df = df[keep].copy()

# Human-readable labels: underscores become spaces, upload-variant suffixes are dropped.
# regex=False keeps these literal replacements on every pandas version.
df['name'] = df['name'].str.replace('_', ' ', regex=False)
df['function'] = (
    df['function']
    .str.replace('_', ' ', regex=False)
    .str.replace(' new upload', '', regex=False)
    .str.replace(' merged upload', '', regex=False)
)

# Throughput in megabytes per second. The previous expression stored plain
# bytes/second under the 'mb/s' label.
# NOTE(review): assumes 'file_bytes' is a byte count and 'time' is in seconds — confirm.
df['mb/s'] = df['file_bytes'] / df['time'] / BYTES_PER_MB
df.to_csv('../output/results.csv', index=False)

# Report every distinct non-empty error. Comparing the set size against 1 hid
# the case where all rows failed with the same message.
errors = set(df['error'].dropna()) - {''}
if errors:
    print(f"Errors: {errors}")
print(f"Steps: {df['step'].max()+1}")
print(f"Data size: {len(df)}")
# NOTE(review): one bar segment is drawn per row, so with several steps per
# function the bar height accumulates the per-step values — confirm this is intended.
px.bar(df, x='function', y='mb/s', color='tech').show()

Steps: 100
Data size: 700


In [119]:
# Order rows by step before building cumulative sums and line traces, so the
# results do not depend on the order in which rows were logged. The stable sort
# keeps the existing order within a step.
df = df.sort_values('step', kind='stable')
groups = df.groupby('function')

# Share of total time per function. Only 'time' is aggregated: summing every
# column would also try to add up the text columns.
sums = groups[['time']].sum()
fig1 = px.pie(sums, values='time', names=sums.index, title='Total Time per Function')
fig1.update_traces(textposition='inside', textinfo='percent+label')

# Raw time of each step, one line per function.
fig2 = go.Figure()
for name, group in groups:
    fig2.add_trace(go.Scatter(x=group['step'], y=group['time'], mode='lines', name=str(name)))
fig2.update_layout(title='Time per Step (by function)', xaxis_title='Step', yaxis_title='Time')

# Running total of time per function across steps.
df['cumulative_time'] = groups['time'].cumsum()
fig3 = px.line(df, x='step', y='cumulative_time', color='function')
fig3.update_layout(title='Cumulative Time per Step (by function)',
                   xaxis_title='Steps',
                   yaxis_title='Cumulative Time')

# Running total of time per tech across steps.
df['tech_cumulative_time'] = df.groupby('tech')['time'].cumsum()
fig4 = px.line(df, x='step', y='tech_cumulative_time', color='tech')
fig4.update_layout(title='Cumulative Time per Step (by tech)',
                   xaxis_title='Steps',
                   yaxis_title='Cumulative Time')

# Display order is kept as in the original cell.
fig1.show()
fig3.show()
fig2.show()
fig4.show()

In [120]:
# Total time per step, reported in ascending step order. A single groupby
# replaces iterating an unordered set and re-filtering the frame for every step.
# 'time' is treated as seconds: /60 gives minutes, /(60*60) gives hours.
step_seconds = df.groupby('step')['time'].sum()
for step, seconds in step_seconds.items():
    print(f"Step {step} took {seconds/60:.2f} minutes")
print(f"total time is: {df['time'].sum()/(60*60):.2f} hours")

Step 0 took 1.88 minutes
Step 1 took 2.40 minutes
Step 2 took 2.46 minutes
Step 3 took 2.01 minutes
Step 4 took 2.13 minutes
Step 5 took 2.16 minutes
Step 6 took 2.56 minutes
Step 7 took 2.03 minutes
Step 8 took 2.49 minutes
Step 9 took 2.46 minutes
Step 10 took 2.36 minutes
Step 11 took 2.53 minutes
Step 12 took 2.57 minutes
Step 13 took 2.75 minutes
Step 14 took 2.37 minutes
Step 15 took 2.33 minutes
Step 16 took 2.86 minutes
Step 17 took 2.11 minutes
Step 18 took 2.46 minutes
Step 19 took 2.62 minutes
Step 20 took 2.40 minutes
Step 21 took 2.75 minutes
Step 22 took 2.38 minutes
Step 23 took 2.50 minutes
Step 24 took 2.58 minutes
Step 25 took 2.44 minutes
Step 26 took 1.91 minutes
Step 27 took 2.59 minutes
Step 28 took 1.60 minutes
Step 29 took 2.28 minutes
Step 30 took 2.43 minutes
Step 31 took 2.57 minutes
Step 32 took 2.42 minutes
Step 33 took 2.74 minutes
Step 34 took 2.50 minutes
Step 35 took 2.20 minutes
Step 36 took 2.22 minutes
Step 37 took 2.21 minutes
Step 38 took 2.16 minu

In [86]:
# Mean time per function, and each mean expressed as a multiple of the fastest one.
times = df.groupby('function')['time'].mean()
# Renamed so the displayed Series is not labelled 'time' while holding ratios.
ratios = (times / times.min()).rename('ratio_to_fastest')
print("Average time:")
# Hand plotly an explicit DataFrame with 'function' and 'time' columns instead of
# relying on a bare Series being coerced.
fig5 = px.pie(times.reset_index(), values='time', names='function')
fig5.update_traces(textposition='inside', textinfo='percent+label')
fig5.show()
ratios.sort_values()

Average time:


function
s3         1.000000
lakefs     1.198064
lfs git    1.363922
lfs s3     1.625733
pyxet      1.687559
dvc        2.010313
git-xet    2.070943
Name: time, dtype: float64