# RPC Log Analysis

## Functionalities
- Plot number of RPCs per second.
- Plot instantaneous latency of RPCs.
- Plot latency distribution of RPCs.

## Input
Log files are read from the experiment directory under `../data` whose name is set by `EXPERIMENT_DIRNAME` in the configuration cell below. This directory is assumed to have the following structure:
```
logs/
  [node-1]/
    *_service*.tar.gz
    ...
    apigateway*.tar.gz
  ...
  [node-n]/
    *_service*.tar.gz
    ...
    apigateway*.tar.gz
```

## Notebook Configuration

In [None]:
########## GENERAL
# Name of the experiment directory inside `../data`. The setup cell joins it
# onto that path to build `experiment_dirpath`.
# NOTE(review): `[TIMESTAMP]` looks like a placeholder to be replaced with the
# real directory name before running — confirm.
EXPERIMENT_DIRNAME = "BuzzBlogBenchmark_[TIMESTAMP]"

########## LATENCY
# Bin size (in milliseconds) used by the latency distribution histogram.
# The histogram cell computes `1000 // LATENCY_BIN_IN_MS`, so a value that
# does not divide 1000 evenly is truncated there.
LATENCY_BIN_IN_MS = 1

## Notebook Setup

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import pandas as pd
import sys
import warnings
warnings.filterwarnings("ignore")

# Put the parent directory on the import path so the `utils` package resolves.
sys.path.append(os.path.abspath(os.path.join("..")))
# Star import: supplies the un-prefixed helpers called in later cells
# (`get_rpc_df`, `get_experiment_start_time`) — presumably defined in
# utils/utils.py; verify there.
from utils.utils import *

# Experiment directory: <current working directory>/../data/<EXPERIMENT_DIRNAME>.
experiment_dirpath = os.path.join(os.path.abspath(""), "..", "data", EXPERIMENT_DIRNAME)

## Log Parsing & Processing

In [None]:
# Build data frame: stack the frame found at position 2 of every item yielded
# by `get_rpc_df` (presumably (node, service, frame) tuples — verify in utils).
rpc = pd.concat([entry[2] for entry in get_rpc_df(experiment_dirpath)])

In [None]:
# Extract experiment information
# Alphabetically ordered names of the distinct RPC functions found in the logs.
function_names = sorted(rpc["function"].unique())
start_time = get_experiment_start_time(experiment_dirpath)
# Largest latency truncated to an integer plus one, stored as a float. This is
# taken before the latency column is multiplied by 1000 in the next cell.
max_latency_in_s = float(int(rpc["latency"].max()) + 1)

In [None]:
# (Re) Build columns
# Seconds elapsed since the experiment start. Computed on the whole column at
# once rather than with a row-wise `apply`, which runs a Python call per RPC.
# `pd.to_timedelta` normalises the difference to a timedelta64 column even if
# the subtraction produced an object-dtype result.
rpc["timestamp"] = pd.to_timedelta(rpc["timestamp"] - start_time).dt.total_seconds()
# Scale latency by 1000 (assumed to be logged in seconds, giving milliseconds;
# the plots below label it "millisec").
rpc["latency"] = rpc["latency"].multiply(1000)
# Nearest whole second, expressed in milliseconds (1 s windows).
rpc["window_1000"] = rpc["timestamp"].round(0).multiply(1000)
# Nearest hundredth of a second, expressed in milliseconds (10 ms windows).
# Round AFTER scaling: `round(2) * 1000` yields keys such as
# 70.00000000000001, which never match the integer `range(...)` used when the
# zoomed plots reindex, so those windows were silently replaced by 0.
rpc["window_10"] = rpc["timestamp"].multiply(100).round(0).multiply(10)

In [None]:
# (Re) Create index: key the RPCs by their elapsed-time column and order the
# rows by that index.
rpc = rpc.set_index("timestamp").sort_index()

## Number of RPCs per Second

In [None]:
n_functions = len(function_names)
fig = plt.figure(figsize=(24, n_functions * 12))
for position, function in enumerate(function_names, start=1):
    # Number of calls to this function in each 1-second window
    per_function = rpc[rpc["function"] == function]
    calls = per_function.groupby(["window_1000"])["window_1000"].count()
    # Windows without any call get an explicit 0, starting from time 0
    last_window = int(calls.index.max())
    calls = calls.reindex(range(0, last_window + 1, 1000), fill_value=0)
    # One stacked subplot per function
    ax = fig.add_subplot(n_functions, 1, position)
    ax.grid(alpha=0.75)
    ax.set_xlim((int(calls.index.min()), int(calls.index.max())))
    ax.set_ylim((0, int(calls.values.max())))
    calls.plot(
        ax=ax,
        kind="line",
        title="RPCs per second - %s" % function,
        xlabel="Time (millisec)",
        ylabel="Calls (count)",
        grid=True,
    )
    plt.subplots_adjust(hspace=0.25)

In [None]:
########## ZOOM IN
# Minimum time (in sec)
MIN_TIME = None
# Maximum time (in sec)
MAX_TIME = None

# Compare against None explicitly: a plain truthiness test treats a valid
# bound of 0 seconds as "not set" and silently skips the plot.
if MIN_TIME is not None and MAX_TIME is not None:
    fig = plt.figure(figsize=(24, len(function_names) * 12))
    # The time filter is the same for every function, so build it once.
    in_range = (rpc.index >= MIN_TIME) & (rpc.index <= MAX_TIME)
    for (i, function) in enumerate(function_names):
        # Number of calls to this function in each 10 ms window of the range
        selected = rpc[(rpc["function"] == function) & in_range]
        # Snap the window keys to whole milliseconds first: float artefacts
        # such as 70.00000000000001 would not match the integer range used by
        # `reindex` below and the window's count would be replaced by 0.
        df = selected.groupby(selected["window_10"].round())["window_10"].count()
        if df.empty:
            continue
        # Windows without any call get an explicit 0
        df = df.reindex(range(int(df.index.min()), int(df.index.max()) + 1, 10), fill_value=0)
        # Plot
        ax = fig.add_subplot(len(function_names), 1, i + 1)
        ax.grid(alpha=0.75)
        ax.set_xlim((int(df.index.min()), int(df.index.max())))
        ax.set_ylim((0, int(df.values.max())))
        # Counts here are per 10 ms window, so the title says so instead of
        # the "per second" wording used for the 1-second plot.
        df.plot(ax=ax, kind="line", title="RPCs per 10 ms - %s" % function, xlabel="Time (millisec)", ylabel="Calls (count)", grid=True)
    # Figure-level spacing only needs to be applied once.
    plt.subplots_adjust(hspace=0.25)

## Instantaneous Latency of RPCs

In [None]:
n_functions = len(function_names)
fig = plt.figure(figsize=(24, n_functions * 12))
# 1-second windows from 0 up to the last window seen across all functions
all_windows = range(0, int(rpc["window_1000"].max()) + 1, 1000)
for position, function in enumerate(function_names, start=1):
    # Highest latency of this function in each window; windows without any
    # call are shown as 0
    per_function = rpc[rpc["function"] == function]
    peak_latency = per_function.groupby(["window_1000"])["latency"].max()
    peak_latency = peak_latency.reindex(all_windows, fill_value=0)
    # One stacked subplot per function
    ax = fig.add_subplot(n_functions, 1, position)
    ax.grid(alpha=0.75)
    ax.set_xlim((0, int(peak_latency.index.max())))
    ax.set_ylim((0, peak_latency.values.max()))
    peak_latency.plot(
        ax=ax,
        kind="line",
        title="Instantaneous Latency - %s" % function,
        xlabel="Time (millisec)",
        ylabel="Latency (millisec)",
        grid=True,
    )
    plt.subplots_adjust(hspace=0.25)

In [None]:
########## ZOOM IN
# Minimum time (in sec)
MIN_TIME = None
# Maximum time (in sec)
MAX_TIME = None

# Compare against None explicitly: a plain truthiness test treats a valid
# bound of 0 seconds as "not set" and silently skips the plot.
if MIN_TIME is not None and MAX_TIME is not None:
    fig = plt.figure(figsize=(24, len(function_names) * 12))
    # The time filter is the same for every function, so build it once.
    in_range = (rpc.index >= MIN_TIME) & (rpc.index <= MAX_TIME)
    for (i, function) in enumerate(function_names):
        # Highest latency of this function in each 10 ms window of the range
        selected = rpc[(rpc["function"] == function) & in_range]
        # Snap the window keys to whole milliseconds first: float artefacts
        # such as 70.00000000000001 would not match the integer range used by
        # `reindex` below and the window's latency would be replaced by 0.
        df = selected.groupby(selected["window_10"].round())["latency"].max()
        if df.empty:
            continue
        # Windows without any call are shown as 0
        df = df.reindex(range(int(df.index.min()), int(df.index.max()) + 1, 10), fill_value=0)
        # Plot
        ax = fig.add_subplot(len(function_names), 1, i + 1)
        ax.grid(alpha=0.75)
        ax.set_xlim((int(df.index.min()), int(df.index.max())))
        ax.set_ylim((0, df.values.max()))
        df.plot(ax=ax, kind="line", title="Instantaneous Latency - %s" % function, xlabel="Time (millisec)", ylabel="Latency (millisec)", grid=True)
    # Figure-level spacing only needs to be applied once.
    plt.subplots_adjust(hspace=0.25)

## Latency Distribution of RPCs

In [None]:
# Number of LATENCY_BIN_IN_MS-wide bins needed to cover latencies from 0 up to
# `max_latency_in_s` seconds. It is identical for every function.
n_bins = (1000 // LATENCY_BIN_IN_MS) * int(max_latency_in_s)
fig = plt.figure(figsize=(24, len(function_names) * 12))
for (i, function) in enumerate(function_names):
    # Latencies (ms) of this function
    latency = rpc.loc[rpc["function"] == function, "latency"]
    if latency.empty:
        continue
    # Bin index of every call. Kept as its own Series instead of being written
    # as a new column onto a filtered slice of `rpc` (chained assignment), and
    # computed on the whole column rather than with a row-wise `apply`.
    latency_bin = (latency // LATENCY_BIN_IN_MS).astype(int).rename("latency_bin")
    p999 = latency.quantile(0.999)
    p50 = latency.quantile(0.50)
    # Plot
    ax = fig.add_subplot(len(function_names), 1, i + 1)
    ax.set_yscale("log")
    ax.grid(alpha=0.75)
    ax.set_xlim((0, n_bins))
    # One tick per bin edge, labelled with the latency in ms at that edge
    ax.set_xticks(range(n_bins + 1))
    ax.set_xticklabels(range(0, (n_bins + 1) * LATENCY_BIN_IN_MS, LATENCY_BIN_IN_MS))
    # Percentile markers, converted from ms to bin units
    ax.axvline(x=p50 / LATENCY_BIN_IN_MS, ls="dotted", lw=5, color="darkorange")
    ax.text(x=p50 / LATENCY_BIN_IN_MS, y=10, s=" P50", fontsize=22, color="darkorange")
    ax.axvline(x=p999 / LATENCY_BIN_IN_MS, ls="dotted", lw=5, color="darkorange")
    ax.text(x=p999 / LATENCY_BIN_IN_MS, y=10, s=" P99.9", fontsize=22, color="darkorange")
    # `bins` lists the bin EDGES, so n_bins bins need n_bins + 1 edges. With
    # one edge fewer the last bin is closed on the right and absorbs the
    # calls that belong to the top bin.
    latency_bin.plot(ax=ax, kind="hist", title="Latency Distribution - %s" % function, xlabel="Latency (milliseconds)", ylabel="Calls (count)", bins=range(n_bins + 1), grid=True)
# Figure-level spacing only needs to be applied once.
plt.subplots_adjust(hspace=0.25)

## Statistics

In [None]:
for function in function_names:
    # Rows of this function; the index holds the elapsed time in seconds
    calls = rpc[rpc["function"] == function]
    latency = calls["latency"]
    n_calls = calls.shape[0]
    # Time between the first and the last call of this function
    elapsed = calls.index.max() - calls.index.min()
    print(function)
    print("  Number of RPCs/s")
    print("    Total:       %7d" % n_calls)
    print("    Avg:         %7.2f" % (n_calls / elapsed))
    print("  Latency (ms)")
    # Latency percentiles, highest first
    percentile_rows = (
        (" P99.99:         %7.2f", 0.9999),
        ("  P99.9:         %7.2f", 0.999),
        ("    P99:         %7.2f", 0.99),
        ("    P95:         %7.2f", 0.95),
        ("    P50:         %7.2f", 0.50),
    )
    for template, quantile in percentile_rows:
        print(template % (latency.quantile(quantile)))
    print("    Avg:         %7.2f" % (latency.mean()))
    print("    Std:         %7.2f" % (latency.std()))