In [16]:
import sys
import os
import itertools
from typing import List, Optional

In [22]:
import plotly.express as px
import pandas as pd
import numpy as np
import humanize
import tabulate

In [18]:
# Load the three benchmark result sets. Each file is read with
# orient='records', i.e. it is expected to be a JSON array of row objects.
# Variable suffixes mirror the file names:
#   c10 -> results-comino-batch10.json
#   c0  -> results-comino-full.json
#   n2  -> results-nuc-batch2.json
benches_c10 = pd.read_json('results-comino-batch10.json', orient='records')
benches_c0 = pd.read_json('results-comino-full.json', orient='records')
benches_n2 = pd.read_json('results-nuc-batch2.json', orient='records')

In [19]:
# Turn each result frame into a list of namedtuple rows so that fields can be
# read as attributes (row.backend, row.operation, ...). index=False keeps the
# DataFrame index out of the tuples.
# NOTE: each name is rebound from a DataFrame to a list, so this cell can only
# be run once after the loading cell; re-run the load first if needed.
benches_c10 = list(benches_c10.itertuples(index=False))
benches_c0 = list(benches_c0.itertuples(index=False))
benches_n2 = list(benches_n2.itertuples(index=False))

In [20]:
# Distinct backend names that occur in any of the three result sets.
backends = {
    row.backend
    for row in itertools.chain(benches_n2, benches_c10, benches_c0)
}
backends

{'CuDF', 'Modin', 'Pandas', 'PyArrow', 'PySpark', 'SQLite'}

In [39]:
# Row order used when building the comparison tables.
# NOTE(review): 'SQLite' shows up in `backends` but is not listed here, so it
# never appears in the tables -- confirm this omission is intentional.
backends_order = [
    'Pandas',
    'Modin',
    'CuDF',
    'PySpark',
    'PyArrow',
]
def find_backend_result(results: list, backend: str, operation: str) -> Optional[str]:
    """Format the per-file time of one backend/operation pair.

    Scans ``results`` in order and uses the first entry whose ``backend`` and
    ``operation`` attributes both equal the requested values.

    Args:
        results: Records exposing ``backend``, ``operation``, ``seconds`` and
            ``iterations`` attributes.
        backend: Backend name to look for.
        operation: Operation name to look for.

    Returns:
        ``seconds / iterations`` of the first match, rendered with two
        decimals as ``'<value> s/file'``, or ``None`` when nothing matches.
    """
    match = next(
        (r for r in results if r.backend == backend and r.operation == operation),
        None,
    )
    if match is None:
        return None
    secs_per_file = match.seconds / match.iterations
    return f'{secs_per_file:.2f} s/file'
def compare_operation(operation: str) -> List[List[Optional[str]]]:
    """Build the comparison table rows for one operation.

    Produces one row per entry of ``backends_order``. Each row has the form
    ``[backend, n2, c10, c0]``, where the last three cells are the values
    returned by ``find_backend_result`` for ``benches_n2``, ``benches_c10``
    and ``benches_c0`` respectively. A cell is ``None`` when
    ``find_backend_result`` returns ``None`` for that result set.

    Args:
        operation: Operation name to compare across backends (e.g. ``'Q1'``).

    Returns:
        A list of rows suitable for passing to ``tabulate.tabulate``.
    """
    # Column order of the table: nuc-batch2, comino-batch10, comino-full.
    result_sets = (benches_n2, benches_c10, benches_c0)
    return [
        [backend] + [
            find_backend_result(results, backend, operation)
            for results in result_sets
        ]
        for backend in backends_order
    ]

In [40]:
# Q1 comparison. Columns: backend, benches_n2, benches_c10, benches_c0
# (order set in compare_operation). A blank cell means no matching result.
print(tabulate.tabulate(compare_operation('Q1')))

-------  -----------  -----------  -----------
Pandas   4.87 s/file  1.38 s/file  0.59 s/file
Modin    5.94 s/file  1.13 s/file  0.06 s/file
CuDF     0.49 s/file  0.13 s/file
PySpark               0.55 s/file  1.27 s/file
PyArrow  0.63 s/file  0.44 s/file  0.21 s/file
-------  -----------  -----------  -----------


In [41]:
# Q2 comparison. Columns: backend, benches_n2, benches_c10, benches_c0
# (order set in compare_operation). A blank cell means no matching result.
print(tabulate.tabulate(compare_operation('Q2')))

-------  -----------  -----------  -----------
Pandas   4.28 s/file  0.91 s/file  0.16 s/file
Modin    5.36 s/file  1.23 s/file  0.21 s/file
CuDF     0.23 s/file  0.08 s/file
PySpark               0.35 s/file
PyArrow  0.50 s/file  0.33 s/file  0.12 s/file
-------  -----------  -----------  -----------


In [42]:
# Q3 comparison. Columns: backend, benches_n2, benches_c10, benches_c0
# (order set in compare_operation). A blank cell means no matching result.
print(tabulate.tabulate(compare_operation('Q3')))

-------  -----------  -----------  -----------
Pandas   6.87 s/file  1.98 s/file  1.23 s/file
Modin    5.83 s/file  1.23 s/file  0.13 s/file
CuDF     0.20 s/file  0.09 s/file
PySpark               0.50 s/file
PyArrow  0.66 s/file               0.29 s/file
-------  -----------  -----------  -----------


In [43]:
# Q4 comparison. Columns: backend, benches_n2, benches_c10, benches_c0
# (order set in compare_operation). A blank cell means no matching result.
print(tabulate.tabulate(compare_operation('Q4')))

-------  -----------  -----------  -----------
Pandas   6.87 s/file  3.77 s/file  1.42 s/file
Modin    8.19 s/file  1.98 s/file  0.57 s/file
CuDF     0.22 s/file  0.10 s/file
PySpark               1.09 s/file
PyArrow  0.77 s/file               0.34 s/file
-------  -----------  -----------  -----------
