# Collectl CPU Log Analysis

## Functionalities
- Plot CPU utilization graphs.

## Input
Log files are read from the experiment directory in `../data` whose name is set by `EXPERIMENT_DIRNAME` in the notebook configuration below. This directory is assumed to have the following structure:
```
logs/
  [node-1]/
    collectl.tar.gz
  ...
  [node-n]/
    collectl.tar.gz
```

## Notebook Configuration

In [None]:
########## GENERAL
# Experiment directory to analyze (a subdirectory of `../data`)
EXPERIMENT_DIRNAME = "BuzzBlogBenchmark_[TIMESTAMP]"

########## CPU
# Metric to plot. One of: "user", "nice", "system", "wait", "irq", "soft",
# "steal", "idle", "total", "guest", "guest_n", "intrpt"
COLLECTL_CPU_METRIC = "total"
# Cores taken into account for each node (node name -> core numbers, matched
# against the `hw_no` column): cores 0-19 on node-0..node-3 and cores 0-9 on
# node-4..node-18.
COLLECTL_CPU_CORES = {
    **{"node-%d" % node_no: range(20) for node_no in range(4)},
    **{"node-%d" % node_no: range(10) for node_no in range(4, 19)},
}

## Notebook Setup

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import pandas as pd
import sys
import warnings
warnings.filterwarnings("ignore")

sys.path.append(os.path.abspath(os.path.join("..")))
from utils.utils import *

# Experiment logs live in `../data/<EXPERIMENT_DIRNAME>`, resolved against the
# absolute path of the current working directory.
working_dirpath = os.path.abspath("")
experiment_dirpath = os.path.join(working_dirpath, "..", "data", EXPERIMENT_DIRNAME)

## Log Parsing & Processing

In [None]:
# Build data frame
# Every item returned by `get_collectl_cpu_df` carries a data frame at
# position 2 (presumably one item per node/log file -- confirm in utils);
# all of them are stacked into a single frame.
cpu_frames = [item[2] for item in get_collectl_cpu_df(experiment_dirpath)]
cpu = pd.concat(cpu_frames)

In [None]:
# Extract experiment information
# Reference instant: subtracted from every collectl timestamp to obtain the
# time elapsed since the start of the experiment.
start_time = get_experiment_start_time(experiment_dirpath)
# Node names: each one is matched against the `node_name` column and used as a
# key of COLLECTL_CPU_CORES when plotting (one subplot per node).
node_names = get_node_names(experiment_dirpath)

In [None]:
# (Re) Build columns
# Seconds elapsed since the start of the experiment. The value depends only on
# the "timestamp" column, so the function is applied to that column directly
# instead of building a Series for every row with `DataFrame.apply(axis=1)`.
cpu["timestamp"] = cpu["timestamp"].apply(lambda ts: (ts - start_time).total_seconds())
# Elapsed time rounded to a whole second, expressed in milliseconds; used to
# group samples into 1000 ms windows.
cpu["window_1000"] = cpu["timestamp"].round(0).multiply(1000)

In [None]:
# (Re) Create index
# Index the samples by elapsed time and order them by that index
cpu = cpu.set_index("timestamp").sort_index()

## CPU Monitoring

In [None]:
# One subplot per node, stacked vertically in node-name order
ordered_node_names = sorted(node_names)
subplot_count = len(ordered_node_names)
fig = plt.figure(figsize=(24, subplot_count * 12))
for (subplot_no, node_name) in enumerate(ordered_node_names, start=1):
    # Samples of this node, restricted to the cores configured for it
    is_node = cpu["node_name"] == node_name
    is_selected_core = cpu["hw_no"].isin(COLLECTL_CPU_CORES[node_name])
    # Mean of the metric over all selected samples in each 1000 ms window
    windowed = cpu[is_node & is_selected_core].groupby(["window_1000"])[COLLECTL_CPU_METRIC].mean()
    # Axes: time from 0 to the last window, utilization from 0 to 100 %
    ax = fig.add_subplot(subplot_count, 1, subplot_no)
    ax.set_xlim((0, windowed.index.max()))
    ax.set_ylim((0, 100))
    ax.grid(alpha=0.75)
    windowed.plot(
        ax=ax,
        kind="line",
        title="%s - CPU Utilization" % node_name,
        xlabel="Time (millisec)",
        ylabel="%s (%%)" % COLLECTL_CPU_METRIC,
        grid=True,
        legend=False,
        yticks=range(0, 101, 10),
    )

In [None]:
########## ZOOM IN
# Minimum time (in sec)
MIN_TIME = None
# Maximum time (in sec)
MAX_TIME = None

# Compare against None explicitly: 0 is a legitimate bound (e.g. zooming in
# from the very start of the experiment) but it is falsy, so a plain truth
# test would silently skip the plot.
if MIN_TIME is not None and MAX_TIME is not None:
    fig = plt.figure(figsize=(24, len(node_names) * 12))
    for (i, node_name) in enumerate(sorted(node_names)):
        # Data frame: samples of this node inside [MIN_TIME, MAX_TIME] on the
        # configured cores, reshaped to one column per core and indexed by
        # timestamp (seconds since the start of the experiment)
        df = cpu[(cpu["node_name"] == node_name) & (cpu.index >= MIN_TIME) & (cpu.index <= MAX_TIME) & (cpu["hw_no"].isin(COLLECTL_CPU_CORES[node_name]))].groupby(["timestamp", "hw_no"])[COLLECTL_CPU_METRIC].mean().unstack()
        # Plot
        ax = fig.add_subplot(len(node_names), 1, i + 1)
        ax.set_xlim((df.index.min(), df.index.max()))
        ax.set_ylim((0, 100))
        ax.grid(alpha=0.75)
        # The x axis is the timestamp index, which is in seconds
        df.plot(ax=ax, kind="line", title="%s - CPU Utilization" % node_name, xlabel="Time (sec)", ylabel="%s (%%)" % COLLECTL_CPU_METRIC, grid=True, legend=False, yticks=range(0, 101, 10))