# Real Traces Plotting/Analysis

The traces come from the Azure Functions 2019 public dataset, available on [GitHub](https://github.com/Azure/AzurePublicDataset/blob/master/AzureFunctionsDataset2019.md). See `dataset/README.md`.

In [None]:
# Common imports.
from pathlib import Path

%matplotlib widget
import base

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import ipywidgets

Global options of the notebook:

* `data_file`: the full path of the data (a CSV file)

In [None]:
# Path of the CSV trace file loaded into `invocations`.
# NOTE: this is an absolute path under the author's home directory; adapt it
# to the local checkout. The dataset files are numbered from d01 to d14, so
# changing the number in the file name selects another file.
data_file = Path(
    "/home/emanuele/marl-dfaas/dataset/data/invocations_per_function_md.anon.d02.csv"
)  # The dataset from d01 to d14.

# The whole file is parsed into a dataframe with one row per function.
invocations = pd.read_csv(data_file)  # Read the data, takes time.

## Structure of an invocation file

In [None]:
# Summarise the layout of the loaded file: its size and the column names at
# both ends (identifying columns first, per-minute columns last).
column_names = list(invocations.columns)

print("Shape (rows, columns) =", invocations.shape)
print("First 10 columns =", column_names[:10])
print("Last 10 columns =", column_names[-10:])

The first four columns identify the function. The first three are hashes, consistent across all files, and the fourth is the trigger type:

* `HashOwner`: owner of the application. One owner can have multiple applications.
* `HashApp`: application. An application can have only one owner but many functions. Note that two identical applications (and their functions) have different hashes if they belong to different owners.
* `HashFunction`: the single function.
* `Trigger`: what causes the function execution.

The remaining columns (`1` to `1440`) contain the number of invocations for each minute of a single 24-hour day.

## Plot of a generic trace

Plots a single trace from the given data file. The trace can be selected by its hash (full or partial) or by the index inside the data file.

In [None]:
def plot_function():
    """Build an interactive bar plot of the per-minute invocations of a function.

    The function is a row of the global ``invocations`` dataframe, selected by
    its positional index with an integer text box. Whenever the index changes,
    the owner, application and function hashes and the trigger are printed in
    an output box and the bar plot is redrawn.

    Returns:
        An ``ipywidgets.AppLayout`` with the index selector as header and, in
        the center, the info box stacked above the figure canvas.
    """
    fn_idx = ipywidgets.BoundedIntText(
        value=300,
        min=0,
        # Every row of the dataset is selectable, and nothing past the last
        # one: an out-of-range position would make `iloc` raise. The outer
        # max() keeps the bound valid (>= min) even for an empty dataframe.
        max=max(len(invocations) - 1, 0),
        description="Function index:",
        disabled=False,
        style={"description_width": "initial"},
    )
    info_box = ipywidgets.Output()

    # Do not display the plot immediately, otherwise it will be displayed
    # outside the AppLayout widget.
    with plt.ioff():
        fig = plt.figure(layout="constrained")
    fig.canvas.header_visible = False
    ax = fig.subplots()

    # Creates and displays the plot. Called after each update of the user input.
    def make_plot():
        # Positional lookup: exactly one row (as a Series) is returned.
        row = invocations.iloc[fn_idx.value]

        # Split the row into the identifying columns (up to "Trigger") and the
        # invocations columns ("1" -> "1440"). The unpacking assumes exactly
        # four columns up to and including "Trigger".
        owner, app, func, trigger = row[:"Trigger"]
        invocs = row["1":]

        with info_box:
            # Clear the previous output text in the box.
            info_box.clear_output()
            print("Owner =", owner)
            print("Application =", app)
            print("Function =", func)
            print("Trigger =", trigger)

        # The axis must be cleared to place the new bars.
        ax.clear()

        # Minutes are numbered from 1, like the dataset columns.
        minutes_idx = np.arange(1, len(invocs) + 1)
        ax.bar(minutes_idx, invocs)

        ax.set_title(f"Function invocations (function index = {fn_idx.value})")
        ax.set_ylabel("Invocations")
        ax.set_xlabel("Minute")

        ax.grid(axis="both")
        ax.set_axisbelow(True)  # Draw the grid behind the bars.

        fig.canvas.draw_idle()
        fig.canvas.flush_events()

    # Make the initial plot with the default value.
    make_plot()

    # Link the input widget and the plotting function.
    fn_idx.observe(lambda change: make_plot(), names="value")

    return ipywidgets.AppLayout(
        header=fn_idx,
        # Put the plot below the info box.
        center=ipywidgets.VBox([info_box, fig.canvas]),
        pane_heights=[0, 6, 0],
    )


plot_function()

## Trigger distribution

Azure Functions supports many triggers, but in the dataset they are grouped into the following categories:

* `http` (HTTP)
* `timer` (Timer)
* `event` (Event Hub, Event Grid)
* `queue` (Service Bus, Queue Storage, RabbitMQ, Kafka, MQTT)
* `storage` (Blob Storage, CosmosDB, Redis, File)
* `orchestration` (Durable Functions: activities, orchestration)
* `others` (all other triggers)

Note that I'm only interested in functions triggered by `http` requests. The analysis of these is in the next section.

In [None]:
def plot_function():
    """Build an interactive bar plot of the number of functions per trigger.

    An integer text box selects one of the dataset files (1 to 14). The file
    is read the first time it is selected and kept in memory afterwards. For
    the selected file, the count of functions for each trigger is printed in
    an output box and drawn as a bar plot.

    Returns:
        An ``ipywidgets.AppLayout`` with the file selector as header and, in
        the center, the info box stacked above the figure canvas.
    """
    # Directory containing the dataset CSV files.
    prefix = Path("/home/emanuele/marl-dfaas/dataset/data/")

    # Dataframes already read from disk, keyed by file path, so that showing
    # a file a second time does not parse the CSV again.
    datasets = {}

    def load_dataset(path):
        """Return the dataframe stored at `path`, reading it only once."""
        frame = datasets.get(path)
        if frame is None:
            # Reading the data takes time.
            frame = datasets[path] = pd.read_csv(path)
        return frame

    dataset_idx = ipywidgets.BoundedIntText(
        value=1,
        min=1,
        max=14,
        description="Dataset file:",
        disabled=False,
        style={"description_width": "initial"},
    )
    info_box = ipywidgets.Output()

    # The figure is created with interactive mode off, otherwise it would be
    # shown right away, outside the AppLayout widget.
    with plt.ioff():
        fig = plt.figure(layout="constrained")
    fig.canvas.header_visible = False
    ax = fig.subplots()

    def redraw(_change=None):
        """Redraw the plot for the currently selected dataset file."""
        csv_name = f"invocations_per_function_md.anon.d{dataset_idx.value:02}.csv"
        day_frame = load_dataset(prefix / csv_name)

        # Number of functions (rows) for each trigger.
        per_trigger = day_frame["Trigger"].value_counts()

        with info_box:
            # Replace the text printed for the previous selection.
            info_box.clear_output()
            print(per_trigger)

        # Remove the old bars before drawing the new ones.
        ax.clear()
        ax.bar(per_trigger.index, per_trigger)

        ax.set(
            title=f"Trigger distribution (dataset idx = {dataset_idx.value:02})",
            ylabel="Functions",
            xlabel="Trigger",
        )

        ax.grid(axis="both")
        ax.set_axisbelow(True)

        fig.canvas.draw_idle()
        fig.canvas.flush_events()

    # First drawing, with the default selection.
    redraw()

    # Redraw whenever the selected file changes.
    dataset_idx.observe(redraw, names="value")

    # The plot is placed below the info box.
    body = ipywidgets.VBox([info_box, fig.canvas])
    return ipywidgets.AppLayout(
        header=dataset_idx,
        center=body,
        pane_heights=[0, 6, 0],
    )


plot_function()

## Sum, mean and std of invocations

In [None]:
# Keep only the functions triggered by http requests.
http = invocations.loc[invocations["Trigger"].eq("http")]

# The "header": every column up to and including "Trigger".
header = http.loc[:, :"Trigger"]

# Row-wise statistics over the invocations columns (from "1" onwards): one
# row per function, one column per statistic.
values = http.loc[:, "1":].agg(["sum", "mean", "std"], axis="columns")

# Put the header columns and the statistics side by side (matched by index).
stats = header.join(values)

In [None]:
# Y-axis label of each statistic. Each statistic is computed per function over
# its per-minute invocation counts, so only the sum is a "total".
ylabels = {
    "sum": "Total invocations",
    "mean": "Mean invocations per minute",
    "std": "Std of invocations per minute",
}

# One bar per http function, identified by its position in `http`. It does not
# depend on the metric, so it is computed once.
func_idx = np.arange(http.shape[0])

# One figure for each statistic.
for metric, ylabel in ylabels.items():
    fig = plt.figure(layout="constrained")
    fig.canvas.header_visible = False
    ax = fig.subplots()

    # Required since there is too much variation between functions.
    ax.set_yscale("log")

    ax.bar(func_idx, stats[metric])

    ax.set_title(f"{metric.capitalize()} of invocations per http function")
    ax.set_ylabel(ylabel)
    ax.set_xlabel("Function index")

    ax.grid(axis="both")
    ax.set_axisbelow(True)  # By default the axis is over the content.

## Invocations distribution of a single trace

In [None]:
def plot_trace_dist(dataset, dataset_name):
    """Build an interactive plot of the invocations distribution of a function.

    The function is a row of `dataset`, selected by its positional index with
    an integer text box. For the selected row, the plot shows how many of the
    invocations columns (from "1" onwards) hold each invocation count. The
    owner, application and function hashes and the trigger are printed in an
    output box.

    Args:
        dataset: dataframe with the same column layout as the invocation
            files (identifying columns up to "Trigger", then the invocations
            columns starting at "1").
        dataset_name: label of the dataset, shown in the plot title.

    Returns:
        An ``ipywidgets.AppLayout`` with the index selector as header and, in
        the center, the info box stacked above the figure canvas.
    """
    fn_idx = ipywidgets.BoundedIntText(
        value=310,
        min=0,
        # The last valid position is len(dataset) - 1: allowing len(dataset)
        # would make `iloc` raise when the maximum is selected. The outer
        # max() keeps the bound valid (>= min) even for an empty dataframe.
        max=max(len(dataset) - 1, 0),
        description="Function index:",
        style={"description_width": "initial"},
    )
    info_box = ipywidgets.Output()

    # Do not display the plot immediately, otherwise it will be
    # displayed outside the AppLayout widget.
    with plt.ioff():
        fig = plt.figure(layout="constrained")
    fig.canvas.header_visible = False
    ax = fig.subplots()

    # Creates and displays the plot. Called at each user input.
    def make_plot():
        # Positional lookup: exactly one row (as a Series) is returned.
        row = dataset.iloc[fn_idx.value]

        # Split the row into the identifying columns (up to "Trigger") and the
        # invocations columns ("1" -> "1440"). The unpacking assumes exactly
        # four columns up to and including "Trigger".
        owner, app, func, trigger = row[:"Trigger"]
        invocs = row["1":]

        # For each distinct invocation count, the number of columns holding it.
        freqs = invocs.value_counts()

        with info_box:
            # Clear the previous output text in the box.
            info_box.clear_output()
            print("Owner hash =", owner)
            print("Application hash =", app)
            print("Function hash =", func)
            print("Trigger =", trigger)

        # The axis must be cleared to place the new bars.
        ax.clear()

        # The X-axis covers every count from 0 to the highest observed one;
        # counts that never occur get a zero-height bar, so that the X values
        # and the frequencies have the same length.
        n_bins = freqs.index.max() + 1
        freqs_idx = np.arange(n_bins)
        freqs = freqs.reindex(range(n_bins), fill_value=0)
        ax.bar(freqs_idx, freqs)

        ax.set_title(
            f"Function invocations distribution (dataset {dataset_name}) (function index = {fn_idx.value})"
        )
        ax.set_ylabel("Frequency")
        ax.set_xlabel("Invocations")
        # Force the X-axis to have integer values (float by default).
        ax.xaxis.get_major_locator().set_params(integer=True)

        ax.grid(axis="both")
        ax.set_axisbelow(True)  # Draw the grid behind the bars.

        fig.canvas.draw_idle()
        fig.canvas.flush_events()

    # Make the initial plot with the default value.
    make_plot()

    # Link the input widget and the plotting function.
    fn_idx.observe(lambda change: make_plot(), names="value")

    return ipywidgets.AppLayout(
        header=fn_idx,
        # Put the plot below the info box.
        center=ipywidgets.VBox([info_box, fig.canvas]),
        pane_heights=[0, 6, 0],
    )


plot_trace_dist(http, "http")