In [3]:
import sys
import os
import humanize

In [4]:
# Link BigPy
sys.path.insert(0, '/home/av/AI-Lab/BigPy/')
from bigpy.bench_report import filtered  # nopep8

In [5]:
import plotly.express as px
import pandas as pd
import numpy as np

In [11]:
# Backends to chart: key as stored in the `backend` column of the benchmark
# data -> label shown in the chart legend. Order matters: the first entry is
# used as the baseline that every other backend is compared against.
_backend_labels = {
    'pandas': 'Pandas',
    'modin': 'Modin',
    'cudf': 'CuDF',
    # 'dask_cudf': 'Dask-CuDF',
    'sqlite': 'SQLite',
    # 'spark': 'Spark',
}
backend_sources = list(_backend_labels.keys())
backend_names = list(_backend_labels.values())
# Load the benchmark measurements (a JSON array of records) from the
# working directory into a DataFrame.
benches = pd.read_json(
    'benchmark.json',
    orient='records',
)

In [12]:
# Per-iteration duration: total `seconds` divided by `iterations`.
benches['mean_duration'] = benches['seconds'].div(benches['iterations'])

# Distinct operation keys found in the data.
workloads = benches['operation'].unique().tolist()

# Distinct dataset sizes, in ascending order.
sizes = list(benches['size_bytes'].unique())
sizes.sort()

In [13]:
# Chart titles, one per workload. They are paired with `workloads` by
# position, so the order here has to match the order of `workloads`.
workloads_names = [
    'Parsing Parquet Files',
    'Query 1: Rides by Different Vendors',
    'Query 2: Mean Ride Prices',
    'Query 3: Rides by Vendor and Year',
    'Query 4: Rides by Vendor and Year and Distance, Sorted',
]
# Show the operation keys found in the data. They are zipped positionally with
# `workloads_names` when the charts are drawn, so the two orders must agree.
workloads

['parse', 'q1', 'q2', 'q3', 'q4']

In [14]:
# List the backend identifiers present in the benchmark data, to compare
# against the entries of `backend_sources`.
sorted(benches['backend'].unique())

['cudf', 'modin', 'pandas', 'sqlite']

In [15]:
# `fig.write_image` does not create missing parent directories, so make sure
# the output folder exists before the first chart is saved.
os.makedirs('results', exist_ok=True)

for workload, workload_name in zip(workloads, workloads_names):

    # For every workload generate a performance chart.
    # Baseline: mean duration of the first configured backend, keyed by
    # dataset size (assumes `filtered` returns the matching rows of `benches`
    # with all columns intact -- TODO confirm against bigpy.bench_report).
    baseline_by_size = (
        filtered(benches, operation=workload, backend=backend_sources[0])
        .groupby('size_bytes')['mean_duration']
        .mean()
    )

    speedups_dfs = []
    for backend_source, backend_name in zip(backend_sources, backend_names):
        measured_by_size = (
            filtered(benches, operation=workload, backend=backend_source)
            .groupby('size_bytes')['mean_duration']
            .mean()
        )

        # Series division aligns on the `size_bytes` index, so each speedup is
        # computed from -- and plotted at -- the same dataset size, regardless
        # of row order or of sizes a backend skipped. Sizes measured on only
        # one side come out as NaN and are dropped.
        speedups = (baseline_by_size / measured_by_size).dropna()

        speedups_dfs.append(pd.DataFrame({
            'Speedup over Pandas': speedups.to_numpy(),
            'Dataset Size, bytes': speedups.index.to_numpy(),
            'Backend': [backend_name] * len(speedups),
        }))

    speedups_df = pd.concat(speedups_dfs, ignore_index=True)
    fig = px.line(
        speedups_df,
        x='Dataset Size, bytes',
        y='Speedup over Pandas',
        color='Backend',
        title=workload_name,
        log_y=True,
        log_x=True,
    )
    fig.show()
    fig.write_image('results/chart_' + workload + '.svg')

    # TODO: print a per-workload summary table of the speedups.
    