In [2]:
import sys
import os

# Link BigPy
sys.path.insert(0, '/home/av/AI-Lab/BigPy/')
from bigpy.bench_report import *  # nopep8
from bigpy.inmem_md import InMemMD  # nopep8
import bigpy.pretty_print as pp  # nopep8

In [3]:
import plotly.express as px
import pandas as pd
import numpy as np

In [13]:
# Load the benchmark report (presumably Google Benchmark JSON output, judging
# by the helper's name) into a DataFrame.
benches = frame_google_benchmarks('benchmark.json')

# Backend identifiers; the first entry is later used as the speedup baseline.
backend_sources = ['pandas', 'modin', 'cudf']
# Display names are currently the very same list object as the identifiers;
# the prettier alternative is kept here for reference: ['Pandas', 'Modin', 'CuDF']
backend_names = backend_sources

In [14]:
# Average duration of a single iteration for every benchmark row.
benches['mean_duration'] = benches['seconds'].div(benches['iterations'])

# Distinct operations, in order of first appearance in the report.
workloads = benches['operation'].drop_duplicates().tolist()

# Distinct dataset sizes, ascending.
sizes = sorted(benches['size'].unique())

In [15]:
# Chart titles, one per workload. They are paired with `workloads` purely by
# position (via `zip`), so the order here must match the order of `workloads`
# displayed below, and a length mismatch would silently drop entries.
workloads_names = [
    'Parsing Parquet Files', 
    'Query 1: Rides by Different Vendors',
    'Query 2: Mean Ride Prices',
    'Query 3: Rides by Vendor and Year',
    'Query 4: Rides by Vendor and Year and Distance, Sorted',
]
# Display the workload identifiers so the pairing can be checked by eye.
workloads

['parse', 'q1', 'q2', 'q3', 'q4']

In [16]:
# Show which backends actually appear in the benchmark report, alphabetically.
sorted(benches['backend'].drop_duplicates())

['cudf', 'modin', 'pandas']

In [17]:
# Saving a figure fails if the target directory is missing, so create the
# output folder before the first chart is written.
os.makedirs('results', exist_ok=True)

for workload, workload_name in zip(workloads, workloads_names):

    # For every workload generate a performance chart.
    # Rows are looked up by `backend_sources` (the identifiers stored in
    # `benches`), while `backend_names` is only used for the chart legend, so
    # the display names can be changed without breaking the filtering.
    # The first backend is the baseline every other backend is compared to.
    # NOTE(review): assumes `filtered` returns a row subset of `benches` that
    # still contains the `size` column - confirm against bigpy.bench_report.
    baseline_rows = filtered(benches, operation=workload, backend=backend_sources[0])
    # Index durations by dataset size; repeated measurements of the same size
    # are averaged so that every size appears exactly once.
    baseline_by_size = baseline_rows.groupby('size')['mean_duration'].mean()

    speedups_dfs = []
    for backend_source, backend_name in zip(backend_sources, backend_names):
        backend_rows = filtered(benches, operation=workload, backend=backend_source)
        measured_by_size = backend_rows.groupby('size')['mean_duration'].mean()

        # Series division aligns on the `size` index, so each speedup compares
        # runs on the same dataset size. Sizes missing from either side yield
        # NaN and are dropped instead of being paired with the wrong size.
        speedups = (baseline_by_size / measured_by_size).dropna()

        df = pd.DataFrame({
            'Speedup over Pandas': speedups.to_numpy(),
            'Dataset Size, bytes': speedups.index.to_numpy(),
            'backend': [backend_name] * len(speedups),
        })
        speedups_dfs.append(df)

    speedups_df = pd.concat(speedups_dfs, ignore_index=True)
    fig = px.line(
        speedups_df,
        x='Dataset Size, bytes',
        y='Speedup over Pandas',
        color='backend',
        title=workload_name,
        log_y=True,
        log_x=True,
    )
    fig.show()
    fig.write_image('results/chart_' + workload + '.svg')

    # TODO: for every workload, also print a table of speedups between the
    # best CPU backend and the best GPU backend.
    