In [334]:
from plotnine import *
import pandas
from sklearn import preprocessing
import itertools
from statistics import mean
from solarized import *
# Keep DataFrame reprs short: show at most 10 rows before truncating.
pandas.set_option("display.max_rows", 10)

Index(['index', 'bytes_memory', 'cpu_percent', 'bytes_sent', 'bytes_recv',
       'wall_time', 'exitcode', 'db_time', 'rep', 'net', 'scale', 'wflow',
       'optimizer'],
      dtype='object')

In [None]:
# Report index; names in braces are the report identifiers (e.g. 5module4 is loaded and plotted below).
# 1. {5end_to_end}: tpch 1, 2, 3, 4, 5 with scale 1, 10
# 2. {5module4} and 3. module4 | also {5module4mem} and {5module4net}
# 3. {5tpchmodin}: tpch 1, 4, 5 with modin on scale 10
# 4. {5micro_scales}: micros with scale 1, 10 | also {5micro_traffic}
# 5. {5micro_net}: micro join, selection with net=wan and scale 1

In [None]:
# Load the raw measurements of the 5module4 report and list the available columns.
df = pandas.read_feather(path='special-reports/5module4.feather')
df.columns

# CHECK

In [335]:
# Sanity check: how many measurements belong to each exit code.
df.exitcode.value_counts()

0    186285
1     21940
Name: exitcode, dtype: int64

In [336]:
# Which workflows produced rows with exit code 1.
df.loc[df['exitcode'] == 1, 'wflow'].unique()

array(['tpch1.py', 'q07.sql', 'q08.sql', 'q09.sql'], dtype=object)

# PREPROCESS

In [399]:
# Collapse df to one row per factor combination, keeping the first
# non-null value of every remaining column, and turn the keys back into columns.
# NOTE(review): `factors` is not defined in the cells shown here; judging by the
# saved column listing it is presumably ['wflow', 'optimizer', 'net', 'scale'] - confirm.
overview = (
    df.groupby(factors)
    .first()
    .reset_index()
)
overview.columns

Index(['wflow', 'optimizer', 'net', 'scale', 'index', 'bytes_sent',
       'wall_time', 'db_time', 'gb_memory', 'cpu', 'gb_net',
       'procedural_time'],
      dtype='object')

In [None]:
# Treat the recorded db_time as transfer time; whatever remains of the wall
# time is attributed to the Python side.
# Reassign instead of renaming in place so the cell produces a fresh frame.
# NOTE(review): `shared_wall_time` is not among the columns listed in the saved
# outputs above (they show `wall_time`) - confirm it is created by an earlier cell.
overview = overview.rename(columns={"db_time": "transfer_time"})
overview["py_time"] = overview["shared_wall_time"] - overview["transfer_time"]

In [None]:
# Keep only the identifiers and the two time components needed for the module4 plot.
module4 = overview.loc[:, ["wflow", "optimizer", "py_time", "transfer_time"]]

# {5module4}

In [402]:
# Long format: one row per (wflow, optimizer, time component), so the two
# components can be mapped to the fill colour of a single bar.
plot_df = pandas.melt(
    module4,
    id_vars=["wflow", "optimizer"],
    value_vars=["py_time", "transfer_time"],
    var_name='var',
    value_name='measurement',
)


In [None]:
# Bars per optimizer, filled by time component (py_time vs. transfer_time),
# with one free-scaled panel per workflow.
plot = (
    ggplot(data=plot_df, mapping=aes(x='optimizer', y='measurement', fill='var'))
    + geom_col(width=0.3)
    + facet_wrap('wflow', scales='free')
    + labs(x="Optimization Method", y="Time (in sec.)")
    + theme(
        figure_size=(5, 2.5),
        subplots_adjust={'hspace': 1, 'wspace': 0.25},
    )
)

In [None]:
# Export the figure as PNG and TikZ, then show it inline.
plot.save("specific-plots/5module4.png")
# tikzplotlib operates on Matplotlib figures; `plot` is a plotnine ggplot, so
# hand over the Matplotlib figure that draw() renders instead of the ggplot itself.
# NOTE(review): tikzplotlib is not imported in the visible import cell - confirm
# it is in scope (e.g. via `from solarized import *`).
# NOTE(review): the drawn figure stays open; close it if the inline backend
# ends up showing the plot twice.
tikzplotlib.save('specific-plots/5module4.tikz', figure=plot.draw())
display(plot)


# {5module4net}

In [None]:
# Received bytes expressed in gigabytes (10**9 bytes).
overview['gb_net'] = overview['bytes_recv'].div(10**9)

In [None]:
# Identifiers plus the received-traffic column for the network plot.
plot_df = overview.loc[:, ["wflow", "optimizer", "gb_net"]]

In [None]:
# Received network volume per optimizer, one free-scaled panel per workflow.
plot = (
    ggplot(plot_df, aes('optimizer', y='gb_net'))
    + geom_col(width=0.3)
    + xlab("Optimization Method")
    # gb_net is bytes_recv / 10**9, so this axis shows gigabytes; the previous
    # "Time (in sec.)" label was carried over from the timing plot.
    + ylab("Received Data (in GB)")
    + facet_wrap('wflow', scales='free')
    + theme(figure_size=(5, 2.5),
           subplots_adjust={'hspace': 1, 'wspace': 0.25})
)

In [None]:
# Export the figure as PNG and TikZ, then show it inline.
plot.save("specific-plots/5module4net.png")
# tikzplotlib operates on Matplotlib figures; `plot` is a plotnine ggplot, so
# hand over the Matplotlib figure that draw() renders instead of the ggplot itself.
# NOTE(review): tikzplotlib is not imported in the visible import cell - confirm
# it is in scope (e.g. via `from solarized import *`).
# NOTE(review): the drawn figure stays open; close it if the inline backend
# ends up showing the plot twice.
tikzplotlib.save('specific-plots/5module4net.tikz', figure=plot.draw())
display(plot)

# {5module4mem}

In [None]:
# Share of the wall time that was spent on the database transfer, per factor combination.
# NOTE(review): `shared_wall_time` is not among the columns listed in the saved
# outputs above - confirm it is created by an earlier cell.
overview["db_ratio"] = overview["transfer_time"] / overview["shared_wall_time"]
# `overview` holds one row per factor combination while `df` holds many rows
# per combination, so a plain column assignment would align the ratios on the
# unrelated row index. Join on the grouping keys instead so every df row gets
# the ratio of its own group.
df = (
    df.drop(columns="db_ratio", errors="ignore")  # keeps the cell re-runnable
    .merge(overview[[*factors, "db_ratio"]], on=list(factors), how="left")
)

In [None]:
# Memory readings expressed in gigabytes (10**9 bytes).
df['gb_memory'] = df['bytes_memory'].div(10**9)

In [None]:
# Identifiers, memory in GB and the snapshot position for the memory plot.
plot_df = df[
    ["wflow", "optimizer", "gb_memory", "snapshot_idx"]
]

In [None]:
# Memory plot: an area layer over snapshot_idx plus a translucent rectangle
# layer drawn from dummy_df, faceted as optimizer (rows) x wflow (columns).
# The 'color' column carries the fill colour of each layer; scale_fill_identity
# uses those values as-is and attaches the two legend labels.
# NOTE(review): C_BLUE / C_GREEN are presumably provided by `from solarized import *` - confirm.
# NOTE(review): plot_df is a column subset of df, so this assignment may raise
# pandas' SettingWithCopyWarning - confirm.
plot_df['color']=C_BLUE
# Same rows as plot_df, but tagged with the second colour for the rectangle layer.
dummy_df=plot_df.copy()
dummy_df['color'] = C_GREEN
plot = (
# NOTE(review): 'Memory usage in %' is not one of the columns selected into
# plot_df (wflow, optimizer, gb_memory, snapshot_idx) - confirm the intended y column.
ggplot(plot_df, aes('snapshot_idx', y='Memory usage in %'))
+ geom_area(aes(fill='color'))
# NOTE(review): xmax is mapped to 'db_ratio', but dummy_df is a copy of plot_df,
# which does not select that column; ymax=100 also presumes a percent scale - confirm.
+ geom_rect(aes(xmax='db_ratio', fill='color'), dummy_df, xmin=0,ymin=0,ymax=100,
           alpha=0.2)
+ facet_grid('optimizer ~ wflow')
+ theme(figure_size=(8, 2.5))
+ labs(y=None)
+ scale_fill_identity(name = 'Area colors', guide = 'legend',labels = ('Global Memory Consumption in %', 'Execution inside the RDBMS')) 
# + scale_colour_manual(name = 'the colour', 
#     values ={'black':'white','red':'blue'}, labels = ('c2','c1'))
)

In [None]:
# Export the figure as PNG and TikZ, then show it inline.
plot.save("specific-plots/5module4mem.png")
# tikzplotlib operates on Matplotlib figures; `plot` is a plotnine ggplot, so
# hand over the Matplotlib figure that draw() renders instead of the ggplot itself.
# NOTE(review): tikzplotlib is not imported in the visible import cell - confirm
# it is in scope (e.g. via `from solarized import *`).
# NOTE(review): the drawn figure stays open; close it if the inline backend
# ends up showing the plot twice.
tikzplotlib.save('specific-plots/5module4mem.tikz', figure=plot.draw())
display(plot)