# Process performance test log
In this notebook we process the performance test results generated by `Performance_test.py`. 

Article: https://towardsdatascience.com/is-something-better-than-pandas-when-the-dataset-fits-the-memory-7e8e983c4fe5

In [1]:
import plotly.express as px
import pandas as pd

In [2]:
# Read l.log and julia.csv. Both files are pipe-delimited with no header
# row, so they share the same column names and parser options.
log_columns = ["timestamp", "process", "step", "time"]
read_options = dict(sep="|", header=None, names=log_columns)
df = pd.read_csv("l.log", **read_options)
j = pd.read_csv("julia.csv", **read_options)

In [3]:
# Stack the two logs into a single frame. ignore_index=True renumbers the
# rows 0..n-1; without it each input keeps its own row labels and the
# combined frame can end up with duplicate index values.
# NOTE: this reassigns `df`, so re-running this cell on its own appends the
# rows of `j` a second time -- re-run the loading cell first.
df = pd.concat([df, j], ignore_index=True)

In [4]:
# Distinct values of the "process" column, in order of first appearance.
pd.unique(df["process"])

array(['pandas', 'vaex', 'dask', 'spark', 'julia-first', 'julia',
       'julia-4-first', 'julia-4'], dtype=object)

In [5]:
# Keep only the rows whose process is "spark".
is_spark = df["process"].eq("spark")
d = df[is_spark]

In [None]:
# Bar chart of the rows in `d`: "time" on the y-axis for each "step".
px.bar(data_frame=d, x="step", y="time")

In [7]:
# Mean "time" for every (process, step) pair, returned as a flat table
# with the grouping keys kept as ordinary columns.
res = df.groupby(["process", "step"], as_index=False)["time"].mean()

In [None]:
# Pandas vs Dask
pair = ["pandas", "dask"]
step_order = ["load_transactions", "load_identity", "merge"]

# Restrict the averaged results to the two processes being compared.
r = res[res["process"].isin(pair)]
fig = px.bar(
    r,
    x="step",
    y="time",
    color="process",
    barmode="group",
    title="Pandas vs Dask",
    category_orders={"process": pair},
    color_discrete_sequence=["blue", "forestgreen"],
)
# Show the steps in a fixed order on the x-axis; the returned figure is the
# cell's displayed output.
fig.update_layout(xaxis=dict(categoryorder="array", categoryarray=step_order))

In [None]:
# Pandas vs Spark
# Select the averaged rows for the two processes under comparison.
r = res.loc[res["process"].isin(["pandas", "spark"])]
bar_options = dict(
    x="step",
    y="time",
    color="process",
    barmode="group",
    title="Pandas vs Spark",
    category_orders={"process": ["pandas", "spark"]},
    color_discrete_sequence=["blue", "forestgreen"],
)
fig = px.bar(r, **bar_options)
# Fix the order of the steps along the x-axis; the call returns the figure,
# which becomes the cell output.
fig.update_xaxes(
    categoryorder="array",
    categoryarray=["load_transactions", "load_identity", "merge"],
)

In [None]:
# Pandas vs Vaex
# Restrict the averaged results to the two processes being compared.
r = res[res["process"].isin(["pandas","vaex"])]
# Pin the process order explicitly, as the other comparison cells do, so that
# the colour assignment (pandas first, vaex second) does not depend on the
# row order of `r`.
fig = px.bar(r, color="process", y="time", x="step", barmode="group",
             title="Pandas vs Vaex",
             category_orders={"process":["pandas","vaex"]},
             color_discrete_sequence=["blue","forestgreen"])
# Show the steps in a fixed order on the x-axis; the returned figure is the
# cell's displayed output.
fig.update_layout(xaxis={'categoryorder':'array', 'categoryarray':["load_transactions","load_identity","merge"]})

In [None]:
# Pandas vs Julia
processes = ["pandas", "julia-first", "julia", "julia-4-first", "julia-4"]
palette = ["blue", "lightgreen", "forestgreen", "orchid", "darkorchid"]
x_axis_settings = {
    "categoryorder": "array",
    "categoryarray": ["load_transactions", "load_identity", "merge"],
}

# Keep only the averaged rows for pandas and the Julia variants.
r = res[res["process"].isin(processes)]
fig = px.bar(
    r,
    x="step",
    y="time",
    color="process",
    barmode="group",
    title="Pandas vs Julia",
    category_orders={"process": processes},
    color_discrete_sequence=palette,
)
# Apply the fixed step order to the x-axis; the returned figure is displayed.
fig.update_layout(xaxis=x_axis_settings)