In [4]:
import yaml

import pandas as pd
import plotly.express as px

# Read data

In [96]:
def get_results(path="results/reading.yaml"):
    """Retrieve the benchmark results as a single pandas DataFrame.

    The YAML document is expected to be a two-level mapping,
    ``dataset -> test -> values``, where ``values`` is anything
    ``pd.DataFrame`` accepts. Each ``values`` entry becomes one frame whose
    original index is exposed as the ``run_num`` column, tagged with the
    ``dataset`` and ``test`` keys it was found under.

    Parameters
    ----------
    path : str or path-like, default "results/reading.yaml"
        Location of the YAML results file.

    Returns
    -------
    pandas.DataFrame
        All per-(dataset, test) frames stacked with a fresh 0..n-1 index.

    Raises
    ------
    OSError
        If ``path`` cannot be opened.
    ValueError
        If the file contains no results at all.
    """

    # Explicit encoding so parsing does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as infile:
        data = yaml.safe_load(infile)

    dfs = []
    # An empty YAML document parses to None; treat it like an empty mapping
    # so it reaches the explicit error below rather than an AttributeError.
    for dataset, subdata in (data or {}).items():
        for test, values in subdata.items():

            # One frame per test; the positional index becomes the run number
            df = pd.DataFrame(values).rename_axis("run_num").reset_index()

            # Add dataset and test names
            df["dataset"] = dataset
            df["test"] = test

            dfs.append(df)

    if not dfs:
        # pd.concat([]) would raise a generic "No objects to concatenate".
        raise ValueError(f"No results found in {path!r}")

    return pd.concat(dfs, ignore_index=True)

In [97]:
# Load every recorded run into one long frame and preview the first rows
dfg = get_results()
dfg.head(n=5)

Unnamed: 0,run_num,pandas_read,pyarrow_ds_read,pyarrow_parquet_ds_read,pyarrow_single_read,dataset,test
0,0,6.225336,2.69885,3.992343,6.212367,dataset_0,filters
1,1,6.088428,2.340882,4.317913,6.130858,dataset_0,filters
2,2,6.00169,2.452182,4.313423,6.297936,dataset_0,filters
3,3,5.708707,2.544423,4.140233,6.754873,dataset_0,filters
4,4,5.922328,2.606034,4.193618,7.379676,dataset_0,filters


# Get averages

In [98]:
# Columns holding the per-run measurements to average
tests = [
    "pandas_read",
    "pyarrow_ds_read",
    "pyarrow_parquet_ds_read",
    "pyarrow_single_read",
]

# Mean over runs for each (dataset, test) pair, with the keys as columns
grouped = dfg.groupby(["dataset", "test"])
df = grouped[tests].agg("mean").reset_index()
df

Unnamed: 0,dataset,test,pandas_read,pyarrow_ds_read,pyarrow_parquet_ds_read,pyarrow_single_read
0,dataset_0,filters,6.197915,3.046323,4.226456,6.224528
1,dataset_0,nofilter,8.8566,7.468612,7.46332,9.433437
2,dataset_1,filters,7.235872,0.464466,0.394429,7.429504
3,dataset_1,nofilter,5.349147,4.558274,4.739228,5.164837
4,dataset_2,filters,29.401111,0.58248,0.739624,29.645606
5,dataset_2,nofilter,22.70939,7.273302,8.621203,28.806552


# Plot results

In [99]:
# Long format: one row per (dataset, test, function) with its mean in "value"
dfa = pd.melt(
    df,
    id_vars=["dataset", "test"],
    var_name="function",
    value_name="value",
)

# Grouped bars per dataset, one facet row per test
px.bar(
    data_frame=dfa,
    x="dataset",
    y="value",
    color="function",
    barmode="group",
    facet_row="test",
    height=800,
)

In [100]:
# Zoom in on a single dataset: compare the tests side by side per function
px.bar(
    data_frame=dfa.loc[dfa["dataset"].eq("dataset_1")],
    x="function",
    y="value",
    color="test",
    barmode="group",
)