# Filter real traces

The traces from `real_traces.ipynb` must be preprocessed before they can be used in the DFaaS environment. This notebook shows the preprocessing operations and the analysis for a single dataset file.

The main operations are:

1. Extract only the functions triggered by `http`.
2. Reshape the functions by time and invocations.
3. Select a subset of "useful/informative" functions.

In [None]:
# Common imports.
from pathlib import Path

%matplotlib widget
import base

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import ipywidgets

Global options of the notebook:

* `data_file`: the full path of the data (a CSV file)

In [None]:
# Full path of the CSV file analysed by this notebook.
# NOTE(review): the inline comment suggests sibling files d01..d14 exist and
# that the suffix selects which one is analysed -- confirm.
data_file = Path(
    "/home/emanuele/marl-dfaas/dataset/data/invocations_per_function_md.anon.d01.csv"
)  # The dataset from d01 to d14.

# Load the whole CSV into a DataFrame; the following cells filter and reshape it.
invocations = pd.read_csv(data_file)  # Read the data, takes time.

## 1. Filter http functions

In [None]:
# Keep only the rows whose trigger is "http"; the "Trigger" column is then
# constant, so it is removed from the result.
http = invocations.loc[invocations["Trigger"].eq("http")].drop(columns=["Trigger"])

## 2. Reshape the functions

In [None]:
# Function to interactively display the invocations of a single function before (original) and after (scaled).
# The function index is selected by the user.


def plot_single_function(original, scaled):
    """Build an interactive view of one function's trace before and after a transformation.

    The user selects a function index; the trace at that index is drawn as a
    bar chart for both datasets, with the x axis numbered from 1.

    Args:
        original: Indexable collection of traces before the transformation;
            ``original[i]`` is the trace of function ``i``.
        scaled: The traces after the transformation, indexed with the same
            function index as ``original``.

    Returns:
        An ``ipywidgets.AppLayout`` with the index input as header and the two
        plots stacked vertically in the center.
    """
    # The last valid index is len(original) - 1. Using len(original) as the
    # upper bound would let the user select an index one past the end, which
    # raises IndexError inside make_plot.
    fn_idx_widget = ipywidgets.BoundedIntText(
        value=300,
        min=0,
        max=max(len(original) - 1, 0),
        description="Function:",
        disabled=False,
    )

    # Do not display the plots immediately, otherwise they will be displayed outside the AppLayout widget.
    with plt.ioff():
        fig_orig = plt.figure(layout="constrained")
        fig_scaled = plt.figure(layout="constrained")
    fig_orig.canvas.header_visible = False
    ax_orig = fig_orig.subplots()
    fig_scaled.canvas.header_visible = False
    ax_scaled = fig_scaled.subplots()

    # One entry per plot: (label used in the title, figure, axes, data).
    panels = (
        ("original", fig_orig, ax_orig, original),
        ("scaled", fig_scaled, ax_scaled, scaled),
    )

    # Creates and displays the plots. Called after each update of the function index.
    def make_plot(fn_idx):
        for dataset, fig, ax, traces in panels:
            # The axis must be cleared to place the new bars.
            ax.clear()

            trace = traces[fn_idx]
            minutes_idx = np.arange(1, len(trace) + 1)
            ax.bar(minutes_idx, trace)

            ax.set_title(
                f"Function invocations ({dataset}) (function index = {fn_idx})"
            )
            ax.set_ylabel("Invocations")
            ax.set_xlabel("Minute")

            ax.grid(axis="both")
            ax.set_axisbelow(True)

            fig.canvas.draw_idle()  # Must be draw_idle not draw with multiple figures.
            fig.canvas.flush_events()

    # Make the initial plot with the default value.
    make_plot(fn_idx_widget.value)

    # Link the input widget and the plotting function.
    fn_idx_widget.observe(lambda change: make_plot(change.new), names="value")

    return ipywidgets.AppLayout(
        header=fn_idx_widget,
        # Put the two plots in vertical.
        center=ipywidgets.VBox([fig_orig.canvas, fig_scaled.canvas]),
        pane_heights=[0, 6, 0],
    )

### Reshape by time

The original dataset has invocations per minute over 24 hours for each function (1440 columns). The DFaaS environment instead expects one step per 5-minute window over 24 hours (288 steps), because we want to have time to adjust the load balancing distribution between steps.

So every 5 consecutive one-minute values in the original dataset are summed into a single value.

In [None]:
# Invocation counts as a 2-D array: every column from the one labelled "1"
# onwards, one row per function.
invocs = http.loc[:, "1":].to_numpy()

# View each row as `groups` chunks of 5 consecutive columns and sum every
# chunk. The reshape raises if the number of columns is not a multiple of 5.
rows = invocs.shape[0]
groups, columns = invocs.shape[1] // 5, 5
invocs_scaled = invocs.reshape(rows, groups, columns).sum(axis=-1)

#### Plot of a single function

In [None]:
# Compare a function's per-minute trace with its 5-minute aggregation.
# Kept as the last expression of the cell so the returned widget is displayed.
plot_single_function(invocs, invocs_scaled)

### Reshape by invocations

In [None]:
def interp(array, dst_range=(0, 150)):
    """Linearly rescale a 1-D trace so that its min/max map onto ``dst_range``.

    The input array is never modified. This matters when the function is used
    with ``np.apply_along_axis``, which passes views of the source array: an
    in-place write here would silently overwrite the source data.

    Args:
        array: 1-D NumPy array of invocation counts (must be non-empty).
        dst_range: ``(low, high)`` bounds of the output. The minimum of
            ``array`` maps to ``low`` and the maximum to ``high``.

    Returns:
        A new ``int32`` array with the same shape as ``array``. Interpolated
        values are truncated toward zero by the integer cast. A flat trace
        (all values equal) has no range to interpolate over and is mapped
        entirely to ``high``.
    """
    dst_low, dst_high = dst_range
    src_low, src_high = array.min(), array.max()

    if src_low == src_high:
        # This array is a flat line: return a fresh array instead of filling
        # the input, and use the same dtype as the interpolated branch.
        return np.full(array.shape, dst_high, dtype=np.int32)

    return np.interp(array, (src_low, src_high), (dst_low, dst_high)).astype(np.int32)


# Apply `interp` independently to every row (axis=1) of the 5-minute traces.
invocs_final = np.apply_along_axis(interp, axis=1, arr=invocs_scaled)

#### Plot of a single function

In [None]:
# Compare a function's 5-minute trace with its rescaled version.
# Kept as the last expression of the cell so the returned widget is displayed.
plot_single_function(invocs_scaled, invocs_final)

#### Plot of two functions

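The method I used to rescale the number of invocations has a major problem: because it is a linear interpolation between each function's own minimum and maximum, a single outlier in a function's trace compresses all of that function's other values toward zero. Those values then look small compared to the values of another function that has no large outlier.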

In [None]:
def plot_two_functions(original, scaled):
    """Build an interactive view comparing the traces of two functions, before and after a transformation.

    The user selects two function indexes (A and B) and can toggle each one on
    or off. Both traces are drawn as bar charts on the same axes (A in red,
    B in blue), once for each dataset.

    Args:
        original: Indexable collection of traces before the transformation;
            ``original[i]`` is the trace of function ``i``.
        scaled: The traces after the transformation, indexed with the same
            function index as ``original``.

    Returns:
        An ``ipywidgets.AppLayout`` with the input widgets as header and the
        two plots stacked vertically in the center.
    """
    # The last valid index is len(original) - 1. Using len(original) as the
    # upper bound would let the user select an index one past the end, which
    # raises IndexError inside make_plot.
    last_idx = max(len(original) - 1, 0)

    fn_a_idx_widget = ipywidgets.BoundedIntText(
        value=300,
        min=0,
        max=last_idx,
        description="Function A:",
        disabled=False,
        style={"description_width": "initial"},
    )
    fn_b_idx_widget = ipywidgets.BoundedIntText(
        value=150,
        min=0,
        max=last_idx,
        description="Function B:",
        disabled=False,
        style={"description_width": "initial"},
    )
    show_fn_a_widget = ipywidgets.Checkbox(
        value=True, description="Show A", disabled=False, indent=False
    )
    show_fn_b_widget = ipywidgets.Checkbox(
        value=True, description="Show B", disabled=False, indent=False
    )

    # Do not display the plots immediately, otherwise they will be displayed outside the AppLayout widget.
    with plt.ioff():
        fig_orig = plt.figure(layout="constrained")
        fig_scaled = plt.figure(layout="constrained")
    fig_orig.canvas.header_visible = False
    ax_orig = fig_orig.subplots()
    fig_scaled.canvas.header_visible = False
    ax_scaled = fig_scaled.subplots()

    # One entry per plot: (label used in the title, figure, axes, data).
    panels = (
        ("original", fig_orig, ax_orig, original),
        ("scaled", fig_scaled, ax_scaled, scaled),
    )

    # Creates and displays the plots. Called after each update of any input widget.
    def make_plot():
        fn_a_idx, fn_b_idx = fn_a_idx_widget.value, fn_b_idx_widget.value
        show_a, show_b = show_fn_a_widget.value, show_fn_b_widget.value
        for dataset, fig, ax, traces in panels:
            # The axis must be cleared to place the new bars.
            ax.clear()

            if show_a:
                trace_a = traces[fn_a_idx]
                minutes_idx = np.arange(1, len(trace_a) + 1)
                ax.bar(minutes_idx, trace_a, color="r", label="A")
            if show_b:
                trace_b = traces[fn_b_idx]
                minutes_idx = np.arange(1, len(trace_b) + 1)
                ax.bar(minutes_idx, trace_b, color="b", label="B")

            ax.set_title(
                f"Function invocations ({dataset}) (function index A = {fn_a_idx}, function index B = {fn_b_idx})"
            )
            ax.set_ylabel("Invocations")
            ax.set_xlabel("Minute")

            # With both traces hidden there is nothing to list, and
            # matplotlib warns when asked for a legend without labelled artists.
            if show_a or show_b:
                ax.legend()
            ax.grid(axis="both")
            ax.set_axisbelow(True)

            fig.canvas.draw_idle()  # Must be draw_idle not draw with multiple figures.
            fig.canvas.flush_events()

    # Make the initial plot with the default value.
    make_plot()

    # Link the input widgets and the plotting function.
    for widget in (
        fn_a_idx_widget,
        fn_b_idx_widget,
        show_fn_a_widget,
        show_fn_b_widget,
    ):
        widget.observe(lambda change: make_plot(), names="value")

    grid = ipywidgets.TwoByTwoLayout(
        top_left=fn_a_idx_widget,
        top_right=show_fn_a_widget,
        bottom_left=fn_b_idx_widget,
        bottom_right=show_fn_b_widget,
    )

    return ipywidgets.AppLayout(
        header=grid,
        # Put the two plots in vertical.
        center=ipywidgets.VBox([fig_orig.canvas, fig_scaled.canvas]),
        pane_heights=[0.5, 6, 0],
    )


# Compare two functions on the 5-minute traces and on their rescaled versions.
# Kept as the last expression of the cell so the returned widget is displayed.
plot_two_functions(invocs_scaled, invocs_final)