# Collectl Dsk Log Analysis

## Functionalities
- Plot disk I/O utilization graphs.

## Input
Log files are read from the experiment directory `../data/[EXPERIMENT_DIRNAME]` (see *Notebook Configuration*). This directory is assumed to have the following structure:
```
logs/
  [node-1]/
    collectl.tar.gz
  ...
  [node-n]/
    collectl.tar.gz
```
A tarball `collectl.tar.gz` contains log files. The log file extension identifies the type of resource monitored:
- `.cpu.gz`: CPU monitoring log file.
- `.numa.gz`: memory monitoring log file.
- `.dsk.gz`: disk I/O monitoring log file.

## Notebook Configuration

In [None]:
########## GENERAL
# Name of the experiment directory in `../data` whose logs are analyzed
EXPERIMENT_DIRNAME = "BuzzBlogBenchmark_[TIMESTAMP]"
# Ramp up duration (in sec)
RAMP_UP_DURATION = 180
# Ramp down duration (in sec)
RAMP_DOWN_DURATION = 180

########## DISK I/O
# Analyzed metric, i.e. the column of the disk data frame that is aggregated and plotted
# (options: "name", "reads", "rmerge", "rkbytes", "waitr", "writes", "wmerge", "wkbytes", "waitw",
# "request", "quelen", "wait", "svctim", "util")
COLLECTL_DSK_METRIC = "writes"
# Filter disks: collection of values matched against the "hw_no" column; None (or an empty
# collection) disables the filter and keeps every disk
COLLECTL_DISKS = None

## Notebook Setup

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import pandas as pd
import sys
import warnings
warnings.filterwarnings("ignore")

sys.path.append(os.path.abspath(os.path.join("..")))
from utils.utils import *

# Experiment logs live in <current working directory>/../data/<EXPERIMENT_DIRNAME>
experiment_dirpath = os.path.join(
    os.path.abspath(os.curdir),
    os.pardir,
    "data",
    EXPERIMENT_DIRNAME,
)

## Log Parsing & Processing

In [None]:
# Build data frame
# Every item produced by get_collectl_dsk_df holds a data frame at position 2 (presumably one
# per parsed log file -- confirm in utils); stack all of them into a single frame.
dsk = pd.concat([
    parsed_log[2]
    for parsed_log in get_collectl_dsk_df(experiment_dirpath)
])

In [None]:
# Filter data frames
# Keep only the rows whose "hw_no" is one of the selected disks; when COLLECTL_DISKS is None
# or empty the data frame is left untouched.
if COLLECTL_DISKS:
    dsk = dsk.loc[dsk["hw_no"].isin(COLLECTL_DISKS)]

In [None]:
# Extract experiment information
# start_time: reference instant that log timestamps are later converted relative to.
# node_names: names of the monitored nodes; the plots below draw one subplot per name.
start_time = get_experiment_start_time(experiment_dirpath)
node_names = get_node_names(experiment_dirpath)

In [None]:
# (Re) Build columns
# "timestamp" becomes the number of seconds elapsed since the experiment start, and "window"
# is that value truncated to an integer, i.e. the 1-second bucket a sample belongs to.
# Both are computed column-wise instead of with a row-wise `apply`: this avoids one Python
# call per row and also works when the data frame has no rows.
dsk["timestamp"] = pd.to_timedelta(dsk["timestamp"] - start_time).dt.total_seconds()
dsk["window"] = dsk["timestamp"].astype(int)

In [None]:
# (Re) Create index
# Index the samples by their elapsed time (in seconds) and order them chronologically.
dsk = dsk.set_index("timestamp").sort_index()

## Disk Monitoring

In [None]:
# Plot disk utilization (1-sec granularity)
# One subplot per node, stacked vertically. Each subplot shows the mean of the selected metric
# over all samples of that node (all disks pooled together) within each 1-second window.
node_count = len(node_names)
fig = plt.figure(figsize=(24, node_count * 12))
for (subplot_no, node_name) in enumerate(node_names, start=1):
    node_dsk = dsk[dsk["node_name"] == node_name]
    per_window = node_dsk.groupby(["window"])[COLLECTL_DSK_METRIC].mean()
    ax = fig.add_subplot(node_count, 1, subplot_no)
    # Clamp the x axis to the windows that actually have data for this node
    ax.set_xlim((per_window.index.min(), per_window.index.max()))
    ax.grid(alpha=0.75)
    per_window.plot(ax=ax, kind="line", title="%s - Disk Utilization" % node_name,
                    xlabel="Time (seconds)", ylabel="%s" % COLLECTL_DSK_METRIC, grid=True)

In [None]:
########## LOCAL CONFIG
# Minimum time (in seconds since the experiment start); None disables the plot
MIN_TIME = None
# Maximum time (in seconds since the experiment start); None disables the plot
MAX_TIME = None

# Plot disk utilization (millisec granularity)
# One subplot per node, restricted to the [MIN_TIME, MAX_TIME] interval (both bounds inclusive).
# Samples are not bucketed into 1-second windows here: each distinct timestamp is plotted, with
# one line per disk ("hw_no").
# The bounds are compared against None explicitly so that 0 (the experiment start) is accepted
# as a valid bound instead of being mistaken for "not configured".
if MIN_TIME is not None and MAX_TIME is not None:
    fig = plt.figure(figsize=(24, len(node_names) * 12))
    for (i, node_name) in enumerate(node_names):
        df = dsk[(dsk["node_name"] == node_name)]
        # The index holds the elapsed time in seconds, so the interval is selected on the index
        df = df[(df.index >= MIN_TIME) & (df.index <= MAX_TIME)]
        # Largest value per (timestamp, disk) pair; unstacking turns each disk into a column,
        # which is drawn as a separate line
        df = df.groupby(["timestamp", "hw_no"])[COLLECTL_DSK_METRIC].max()
        df = df.unstack()
        ax = fig.add_subplot(len(node_names), 1, i + 1)
        ax.set_xlim((df.index.min(), df.index.max()))
        ax.grid(alpha=0.75)
        df.plot(ax=ax, kind="line", title="%s - Disk Utilization" % node_name, xlabel="Time (seconds)",
                ylabel="%s" % COLLECTL_DSK_METRIC, grid=True, legend=True)