In [72]:
from xetrack import Reader
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "iframe"


# Widen pandas' display limits so the benchmark frame can be inspected in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Load every tracked row from the stats database.
df = Reader('../output/stats.db').to_df()
if df.empty:
    # Fail with a readable message instead of an IndexError from .iloc below.
    raise ValueError("No tracked rows found in '../output/stats.db'")

# Filter last experiment: the track_id of the final row identifies the latest run.
last_track_id = df['track_id'].iloc[-1]
keep = (
    (df['track_id'] == last_track_id)
    & (df['tech'] != 'shutil')
    & (df['tech'] != 'm1')
    & (df['step'] >= 0)
)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (avoids SettingWithCopyWarning).
df = df[keep].copy()

# Human-readable names: drop the '_upload' suffix and turn underscores into spaces.
# regex=False makes the literal match explicit (the default differs between pandas versions).
df['name'] = (
    df['name']
    .str.replace('_upload', '', regex=False)
    .str.replace('_', ' ', regex=False)
)
df = df[df['name'] != 'lfs git'].copy()

# Throughput column: file_bytes divided by elapsed time.
df['mb/s'] = df['file_bytes'] / df['time']
df.to_csv('../output/results.csv', index=False)

# Collect the real error messages only. The previous check (`len(errors) > 1` on a
# set that included the '' placeholder) stayed silent when every row failed with
# the same message.
errors = set(df['error'].dropna()) - {''}
if errors:
    print(f"Errors: {errors}")
print(f"Steps: {df['step'].max()+1}")
print(f"Data size: {len(df)}")
print("\nTime per tech - lower is better")
print(f"track_id: {last_track_id}")
px.bar(df, x='name', y='time', color='tech').show()

Steps: 2
Data size: 12

Time per tech - lower is better
track_id: 881adb10-7da4-4264-8460-5185cac811e4


In [73]:
# Bar chart of throughput ('mb/s') per upload name, coloured by tech.
print("MB per Second - higher is better")
throughput_fig = px.bar(df, x='name', y='mb/s', color='tech')
throughput_fig.show()

MB per Second - higher is better


In [74]:
pio.renderers.default='iframe'

# Order rows by step so the line traces run left-to-right and the cumulative sums
# accumulate in step order, independent of the row order in the database.
# A stable sort keeps the original relative order within each step.
df = df.sort_values('step', kind='stable')

groups = df.groupby('name')

# fig1: share of total time per name.
# Only 'time' is summed; summing every column would also try to aggregate the
# string/timestamp columns, which is wasteful and can fail on mixed-type columns.
sums = groups[['time']].sum()
fig1 = px.pie(sums, values='time', names=sums.index)
fig1.update_traces(textposition='inside', textinfo='percent+label')

# fig2: time per step, one line per name.
fig2 = go.Figure()
for name, group in groups:
    fig2.add_trace(go.Scatter(x=group['step'], y=group['time'], mode='lines', name=str(name)))
fig2.update_layout(xaxis_title='Step', yaxis_title='Time')

# fig3: running total of time per name.
df['cumulative_time'] = df.groupby('name')['time'].cumsum()
fig3 = px.line(df, x='step', y='cumulative_time', color='name')
fig3.update_layout(title='Cumulative Lines of Time per Steps', xaxis_title='Steps', yaxis_title='Cumulative Time')

# fig4: running total of time per tech.
df['tech_cumulative_time'] = df.groupby('tech')['time'].cumsum()

fig4 = px.line(df, x='step', y='tech_cumulative_time', color='tech')
# The title names the grouping so this chart is distinguishable from fig3.
fig4.update_layout(title='Cumulative Lines of Time per Steps (by tech)',
                  xaxis_title='Steps',
                  yaxis_title='Cumulative Time')

fig1.show()

In [75]:
# Render fig3: the line chart of cumulative time per step, one line per name.
fig3.show()

In [56]:
# Render fig2: the line chart of time per step, one trace per name.
fig2.show()

In [57]:
# Render fig4: the line chart of cumulative time per step, one line per tech.
fig4.show()

In [69]:
# Total time per step, reported in minutes.
# groupby returns the steps in sorted order and scans the frame once; iterating a
# set gave no ordering guarantee and re-filtered the whole frame for every step.
step_seconds = df.groupby('step')['time'].sum()
for step, seconds in step_seconds.items():
    print(f"Step {step} took {seconds/60:.2f} minutes")
# Overall total across all steps, in hours.
print(f"total time is: {df['time'].sum()/(60*60):.2f} hours")

Step 0 took 37.85 minutes
Step 1 took 35.83 minutes
total time is: 1.23 hours


In [70]:
# Mean time per function, and each mean expressed as a multiple of the smallest mean.
times = df.groupby('function')['time'].mean()
fastest = times.min()
ratios = times.div(fastest)

print("Average time:")
fig5 = px.pie(times, values='time', names=times.index)
fig5.update_traces(textinfo='percent+label', textposition='inside')
fig5.show()

# Last expression of the cell: the ratios in ascending order (smallest is 1.0).
ratios.sort_values()

Average time:


function
git-xet upload    1.000000
lakefs upload     1.160557
pyxet upload      1.176941
s3 new upload     1.330202
dvc upload        3.846026
lfs s3 upload     4.556604
Name: time, dtype: float64

In [13]:
# Show the track_id of the last row of the current frame.
df['track_id'].iat[-1]

'dd264b1b-48ac-46db-8cc6-95479fa2ddf6'

In [51]:
# Display only the rows whose tech is 's3'.
df.loc[df['tech'].eq('s3')]

Unnamed: 0,timestamp,track_id,merged,step,filename,tech,p_memory_percent,memory_percent,cpu,function,bytes_sent,bytes_recv,disk_percent,name,time,error,file_bytes,is_merged,workflow,filepath,args,kwargs,pyxet_version,gitxet_version,n_rows_add,start_rows,columns,upload_time,pyxet,gitxet,mb/s
1937,05-09-2023 15:43:47.555535,5b54b895-2872-49d0-a1ac-1d5b325e24d5,False,0,,s3,221.903197,1381.248784,0.0,s3 new upload,2112.575195,131.231445,151.2,s3,216.37366,,7635.660055,,numeric,numeric/numeric.parquet,['numeric/numeric.parquet'],{},,,1.0,100000000.0,10.0,,0.1.3,gitxetcore 0.11.0-5dc8c78\n,35.289231
1944,05-09-2023 16:13:11.407709,5b54b895-2872-49d0-a1ac-1d5b325e24d5,False,1,,s3,294.21006,1322.292726,0.2,s3 new upload,174.791992,152.044922,155.4,s3,222.770629,,8017.938432,,numeric,numeric/numeric.parquet,['numeric/numeric.parquet'],{},,,1.0,100000000.0,10.0,,0.1.3,gitxetcore 0.11.0-5dc8c78\n,35.991901
