In [2]:
import sys
import os
import itertools
from typing import List, Optional

In [3]:
import plotly.express as px
import pandas as pd
import numpy as np
import humanize
import tabulate

In [4]:
# Read the three benchmark result files (JSON laid out as a list of records)
# into DataFrames. Files are read in the order: comino batch-10, comino full,
# nuc batch-2.
benches_c10, benches_c0, benches_n2 = (
    pd.read_json(results_path, orient='records')
    for results_path in (
        'results-comino-batch10.json',
        'results-comino-full.json',
        'results-nuc-batch2.json',
    )
)

In [5]:
def _to_records(frame):
    """Return ``frame`` as a list of namedtuple rows (index column excluded).

    A value that is already a list is handed back unchanged. The names below
    are rebound from DataFrame to list, so without this guard a second run of
    the cell would fail with ``AttributeError`` (lists have no ``itertuples``).
    """
    if isinstance(frame, list):
        return frame
    # itertuples documents `index` as a bool; False keeps only the data columns.
    return list(frame.itertuples(index=False))


benches_c10 = _to_records(benches_c10)
benches_c0 = _to_records(benches_c0)
benches_n2 = _to_records(benches_n2)

In [6]:
# Distinct backend names that occur in any of the three result sets.
backends = {
    run.backend
    for run in itertools.chain(benches_n2, benches_c10, benches_c0)
}
backends

{'CuDF', 'Dask+CuDF', 'Modin', 'Pandas', 'PyArrow', 'PySpark', 'SQLite'}

In [14]:
# Row order used by the comparison tables.
# NOTE(review): 'SQLite' is present in `backends` but not listed here, so it
# never shows up in the tables — confirm the omission is intentional.
backends_order = [
    'Pandas',
    'Modin',
    'CuDF',
    'PySpark',
    'PyArrow',
    'Dask+CuDF',
]
# Number of files each per-iteration timing is scaled up to.
total_files = 125
def find_backend_result(
    results: list,
    backend: str,
    operation: str,
    files: Optional[int] = None,
) -> Optional[str]:
    """Format the estimated time for one backend/operation pair.

    The first usable record in ``results`` whose ``backend`` and ``operation``
    attributes match is taken; its ``seconds / iterations`` ratio is multiplied
    by the number of files and rendered with two decimals.

    Args:
        results: Records exposing ``backend``, ``operation``, ``seconds`` and
            ``iterations`` attributes.
        backend: Backend name to look for.
        operation: Operation name to look for.
        files: Number of files to scale the per-iteration time to. When
            omitted, the module-level ``total_files`` is read at call time.

    Returns:
        A string such as ``'250.00 s'``, or ``None`` when no matching record
        with a non-zero iteration count exists.
    """
    if files is None:
        files = total_files
    for record in results:
        if record.backend != backend or record.operation != operation:
            continue
        if not record.iterations:
            # Zero iterations carries no timing; skip it instead of dividing by zero.
            continue
        secs_per_file = record.seconds / record.iterations
        return f'{files * secs_per_file:.2f} s'
    return None
def compare_operation(operation: str) -> str:
    """Build a plain-text table comparing all backends for one operation.

    One row is produced per entry of ``backends_order``; the columns hold the
    value of ``find_backend_result`` for ``benches_n2``, ``benches_c10`` and
    ``benches_c0`` respectively. A missing measurement is ``None``, which
    ``tabulate`` renders as an empty cell.

    Args:
        operation: Operation name to look up in every result set.

    Returns:
        The table text produced by ``tabulate.tabulate``.
    """
    # Header label and data source are paired so the two cannot drift apart.
    columns = (
        ('Desktop (b=2)', benches_n2),
        ('Server (b=10)', benches_c10),
        ('Server (bulk)', benches_c0),
    )
    rows: List[List[Optional[str]]] = [
        [backend] + [find_backend_result(runs, backend, operation) for _, runs in columns]
        for backend in backends_order
    ]
    return tabulate.tabulate(rows, headers=['Engine'] + [label for label, _ in columns])

In [15]:
# Comparison table for operation 'Q1'.
print(compare_operation(operation='Q1'))

Engine     Desktop (b=2)    Server (b=10)    Server (bulk)
---------  ---------------  ---------------  ---------------
Pandas     608.56 s         172.09 s         73.76 s
Modin      743.05 s         141.85 s         7.60 s
CuDF       61.58 s          16.65 s
PySpark    159.76 s         68.28 s          158.94 s
PyArrow    78.56 s          54.76 s          26.38 s
Dask+CuDF                                    309.75 s


In [16]:
# Comparison table for operation 'Q2'.
print(compare_operation(operation='Q2'))

Engine     Desktop (b=2)    Server (b=10)    Server (bulk)
---------  ---------------  ---------------  ---------------
Pandas     535.10 s         114.07 s         20.14 s
Modin      669.69 s         154.06 s         25.79 s
CuDF       28.38 s          10.56 s
PySpark    73.59 s          43.70 s
PyArrow    62.67 s          40.99 s          14.63 s
Dask+CuDF                                    166.51 s


In [17]:
# Comparison table for operation 'Q3'.
print(compare_operation(operation='Q3'))

Engine     Desktop (b=2)    Server (b=10)    Server (bulk)
---------  ---------------  ---------------  ---------------
Pandas     858.55 s         247.78 s         153.90 s
Modin      729.07 s         153.71 s         15.91 s
CuDF       25.51 s          11.23 s
PySpark    125.08 s         63.10 s
PyArrow    82.43 s                           36.18 s
Dask+CuDF                                    295.24 s


In [13]:
# Comparison table for operation 'Q4'.
# NOTE(review): this cell's execution count (13) precedes the cell that defines
# the helpers (14), and the saved output below is in "s/file" whereas the
# current find_backend_result emits totals in "s" — re-run to refresh it.
print(compare_operation(operation='Q4'))

Engine     Desktop (b=2)    Server (b=10)    Server (bulk)
---------  ---------------  ---------------  ---------------
Pandas     6.87 s/file      3.77 s/file      1.42 s/file
Modin      8.19 s/file      1.98 s/file      0.57 s/file
CuDF       0.22 s/file      0.10 s/file
PySpark    5.35 s/file      1.09 s/file
PyArrow    0.77 s/file                       0.34 s/file
Dask+CuDF                                    3.26 s/file


To summarize the results, here are the winners across categories:

1. Consistent performance: **PyArrow**.
2. Peak performance: **CuDF**.
3. Energy efficiency: **CuDF**.
4. Accessibility: **Modin**.

As for Spark and Dask, they are much harder to configure and less efficient, which makes them harder to recommend.

TODO:

1. Redo bulk on server for: Pandas, Modin, PyArrow.
2. Redo batch-10 on server for PyArrow.