In [17]:
import sys
import os
import humanize

In [18]:
# Link BigPy: make the local BigPy checkout importable.
# The checkout location can be overridden through the BIGPY_PATH environment
# variable; when it is unset, the original hard-coded location is used.
BIGPY_PATH = os.environ.get('BIGPY_PATH', '/home/av/AI-Lab/BigPy/')
if BIGPY_PATH not in sys.path:
    # Guarded so that re-running this cell does not keep growing sys.path.
    sys.path.insert(0, BIGPY_PATH)
from bigpy.bench_report import filtered  # nopep8

In [19]:
import plotly.express as px
import pandas as pd
import numpy as np

In [20]:
# Backends included in this report, in the order they are charted.
# 'Dask-CuDF' and 'Spark' are deliberately left out for now.
backend_names = ['Pandas', 'Modin', 'CuDF', 'SQLite']

# Benchmark measurements, read from a records-oriented JSON file.
benches = pd.read_json('report/results.json', orient='records')

In [21]:
# Time of a single iteration: total measured seconds over the iteration count.
benches['mean_duration'] = benches['seconds'].div(benches['iterations'])

# Distinct dataset sizes in ascending order.
sizes = sorted(benches['dataset_bytes'].unique())

# Distinct operation labels, in the order they first appear in the results.
workloads = benches['operation'].unique().tolist()

In [22]:
# Human-readable chart titles, one per workload.
# They are paired with `workloads` by position (via zip) when the charts are
# drawn, so the order here must follow the order of the operation labels.
# Operations beyond the last title get no chart: the results contain a sixth
# operation, 'Close', that has no title here.
workloads_names = [
    'Parsing Parquet Files', 
    'Query 1: Rides by Different Vendors',
    'Query 2: Mean Ride Prices',
    'Query 3: Rides by Vendor and Year',
    'Query 4: Rides by Vendor and Year and Distance, Sorted',
]
# Show the operation labels found in the results, for comparison with the titles.
workloads

['Parse', 'Query 1', 'Query 2', 'Query 3', 'Query 4', 'Close']

In [23]:
# Alphabetical list of the backends that actually appear in the results.
sorted(benches['backend'].drop_duplicates())

['CuDF', 'Modin', 'Pandas', 'SQLite']

In [24]:
# Display the benchmark table, including the derived `mean_duration` column.
benches

Unnamed: 0,backend,operation,dataset,dataset_bytes,seconds,iterations,error,mean_duration
0,Pandas,Parse,462.0 MB,461966527,10.507544,3,,3.502515e+00
1,Pandas,Query 1,462.0 MB,461966527,10.554182,14,,7.538701e-01
2,Pandas,Query 2,462.0 MB,461966527,10.060498,49,,2.053163e-01
3,Pandas,Query 3,462.0 MB,461966527,10.393981,8,,1.299248e+00
4,Pandas,Query 4,462.0 MB,461966527,10.310615,7,,1.472945e+00
...,...,...,...,...,...,...,...,...
81,SQLite,Close,25.5 GB,25496916389,10.000001,22940437,,4.359115e-07
82,Pandas,Close,39.7 GB,39704777198,10.000000,22930864,,4.360935e-07
83,Modin,Close,39.7 GB,39704777198,10.000000,23121269,,4.325022e-07
84,CuDF,Close,39.7 GB,39704777198,10.000001,23157732,,4.318212e-07


In [30]:
# For every workload: draw a "speedup over the baseline backend" chart, save it
# as an SVG, and print one row of Modin-vs-Pandas speedups.
#
# Speedups are computed per dataset size: the baseline and the measured backend
# are matched on `dataset_bytes`, not on row position, so a backend that lacks
# some dataset size can never be compared against (or plotted at) the wrong one.


def _mean_duration_by_size(operation, backend):
    """Return the mean iteration time of one (operation, backend) pair.

    The result is a Series of `mean_duration` values indexed by
    `dataset_bytes` in ascending order. If the same dataset size was recorded
    more than once, its durations are averaged.
    """
    # NOTE(review): assumes `filtered` returns the matching rows of `benches`
    # with all columns kept (it lives in bigpy.bench_report) - confirm.
    rows = filtered(benches, operation=operation, backend=backend)
    return rows.groupby('dataset_bytes')['mean_duration'].mean()


def _speedup(baseline, measured):
    """Return `baseline / measured` for the dataset sizes present in both."""
    baseline, measured = baseline.align(measured, join='inner')
    return baseline / measured


# The first configured backend is the baseline of every chart. The y-axis
# label below says "Pandas", which matches only while 'Pandas' stays first.
baseline_backend = backend_names[0]

# zip() stops at the shorter list, so operations without a title are skipped.
for workload, workload_name in zip(workloads, workloads_names):

    # For every workload generate a performance chart
    baseline = _mean_duration_by_size(workload, baseline_backend)
    speedups_dfs = []
    for backend_name in backend_names:
        speedup = _speedup(baseline, _mean_duration_by_size(workload, backend_name))
        speedups_dfs.append(pd.DataFrame({
            'Speedup over Pandas': speedup.to_numpy(),
            'Dataset Size, bytes': speedup.index.to_numpy(),
            'Backend': [backend_name] * len(speedup),
        }))

    speedups_df = pd.concat(speedups_dfs, ignore_index=True)
    fig = px.line(
        speedups_df,
        x='Dataset Size, bytes',
        y='Speedup over Pandas',
        color='Backend',
        title=workload_name,
        log_y=True,
        log_x=True,
    )
    fig.show()
    fig.write_image('report/chart_' + workload + '.svg')

    # For every workload, print a table row: Pandas time / Modin time, one
    # entry per dataset size, smallest size first.
    # NOTE(review): Modin is the backend tabulated here; if the table is meant
    # to show the GPU backend instead, switch 'Modin' to 'CuDF'.
    table_speedups = _speedup(
        _mean_duration_by_size(workload, 'Pandas'),
        _mean_duration_by_size(workload, 'Modin'),
    )
    print(' | '.join('{0:.1f}x'.format(x) for x in table_speedups))
    

2.0x | 1.6x | 1.6x


1.8x | 2.5x | 3.4x


0.5x | 0.4x | 0.5x


1.0x | 1.0x | 1.0x


0.6x | 0.6x | 0.6x


In [1]:
# Inspect the chart data left behind by the last iteration of the workload loop.
# NOTE: `speedups_df` only exists once that loop cell has run in the current
# kernel; executed first on a fresh kernel, this cell raises NameError.
speedups_df

NameError: name 'speedups_df' is not defined