In [None]:
import numpy as np
import pandas as pd
import pylab as pl
import matplotlib
import matplotlib.pyplot as plt

from functools import partial
from datetime import timedelta

In [None]:
%load_ext autoreload
%autoreload 1
%aimport analyze_simulation

import analyze_simulation as tools

In [None]:
from os import environ, path

# Directory with the simulator output files; taken from the `data_dir`
# environment variable and defaulting to the relative path 'data'.
data_dir = environ.get('data_dir', 'data')
print("Data dir: {}".format(data_dir))

In [None]:
# Load the per-operation statistics table from the data directory.
operations = pd.read_csv(path.join(data_dir, 'operations_stats.csv'))

# The return value is discarded, so this presumably adjusts `operations`
# in place -- confirm against analyze_simulation.preprocess_operations.
tools.preprocess_operations(operations)

# Latest finish time before any time filtering. The double division by 60
# turns seconds into hours (assuming finish_time is in seconds, which the
# "h" suffix below suggests). Series.max() is vectorized and skips NaN,
# unlike the builtin max(), whose result depends on element order when NaN
# values are present.
max_time = (operations['finish_time'] / 60.0 / 60.0).max()
print("Max finish time (unfiltered): {0:.3f}h".format(max_time))

In [None]:
# Quick sanity summary of the loaded operations table.
print(operations.shape)

# (message template, column whose sum is reported)
_summary_rows = [
    ("Total preempted jobs: {:,}", 'preempted_job_count'),
    ("Total jobs: {:,}", 'job_count'),
    ("Total preempted duration: {}s", 'preempted_jobs_total_duration'),
    ("Total jobs duration: {}s", 'jobs_total_duration'),
]
for _template, _column in _summary_rows:
    print(_template.format(operations[_column].sum()))
# Uncomment to preview the first rows:
# operations.head()

In [None]:
# Bounds of the time window passed to tools.time_filter. The defaults
# (0 .. 1e9) are presumably wide enough to keep every operation -- confirm
# against the data if in doubt.
start_time_threshold = 0
end_time_threshold = 1e9
# Alternative narrower window (2 h .. 10 h if the unit is seconds):
# start_time_threshold = 60 * 60 * 2
# end_time_threshold = 60 * 60 * 10

# All analysis cells below work on this filtered table.
operations_filtered = tools.time_filter(operations, start_time_threshold, end_time_threshold)

In [None]:
# Report each finish-time percentile, converted with two divisions by 60
# (seconds -> hours, matching the "h" suffix in the message).
for percentile, finish_time in tools.get_finish_time_percentiles(operations_filtered):
    finish_time_hours = finish_time / 60.0 / 60.0
    print("{}% finish time: {:.3f}h".format(percentile, finish_time_hours))

In [None]:
# Build the load history for the filtered operations and plot it against
# every integer time step in [min_time, max_time].
# NOTE(review): this rebinds `max_time`, which an earlier cell set to the
# maximum finish time in hours; here it is the integer upper bound returned
# by build_load_history.
(history, min_time, max_time) = tools.build_load_history(operations_filtered)
plt.plot(range(min_time, max_time + 1), history)

In [None]:
# Six logarithmically spaced bucket boundaries: 1, 10, 100, ..., 100000.
# `ranges` is reused by the per-bin cell further down.
ranges = np.logspace(0, 5, 6)
tools.plot_job_count_distribution(ranges, operations_filtered)

In [None]:
# Differences plot over the whole filtered set with weighting enabled; see
# analyze_simulation.plot_differences for what exactly is compared.
tools.plot_differences(operations_filtered, weighted=True)

In [None]:
# Split the filtered operations by job count using the `ranges` boundaries
# and draw one differences plot per bucket.
bins = tools.split_into_bins(operations_filtered, operations_filtered['job_count'], ranges)

# Pair every boundary with the next one; the last bucket is open-ended.
for i, (left_bound, right_bound) in enumerate(zip(ranges, list(ranges[1:]) + [np.inf])):
    print("Range: [{}, {}]".format(left_bound, right_bound))
    tools.plot_differences(bins[i])

In [None]:
# NOTE(review): the meaning of the second argument (1000) is defined in
# analyze_simulation.find_suspicious_operations -- confirm it and consider
# giving it a named constant.
tools.find_suspicious_operations(operations_filtered, 1000)

In [None]:
import matplotlib.pyplot as plt

from bisect import bisect_left

class discrete_cdf:
    """Empirical distribution function of a finite sample.

    Calling an instance with ``point`` returns the fraction of samples that
    are strictly smaller than ``point`` (the lookup uses ``bisect_left``).

    ``data`` may be any iterable of mutually comparable values, including a
    one-shot iterator or generator. Calling an instance built from an empty
    sample raises ``ZeroDivisionError``.
    """

    def __init__(self, data):
        self._data = sorted(data)  # bisect requires sorted input
        # Measure the sorted copy, not `data`: `data` may be an iterator
        # that has no len() and was already consumed by sorted().
        self._data_len = float(len(self._data))

    def __call__(self, point):
        # bisect_left already returns the number of samples below `point`,
        # so there is no need to materialize a slice just to take its length.
        return bisect_left(self._data, point) / self._data_len

def plot_cdf(data, color, label):
    """Draw the empirical CDF of `data` on the current pyplot axes.

    The CDF is evaluated at every integer from 0 up to (but excluding)
    max(data). `color` is passed to plt.plot as the format argument and
    `label` as the legend label. Returns whatever plt.plot returns.
    """
    empirical_cdf = discrete_cdf(data.values)
    grid = np.arange(0, max(data))
    fractions = list(map(empirical_cdf, grid))
    return plt.plot(grid, fractions, color, label=label)

# Compare the duration CDFs of the real run and of the simulator on one
# log-scaled figure.
_axis_label_style = {'size': 14}

plt.figure(figsize=(18, 8))
plt.xscale('log')
plt.xlabel('duration', **_axis_label_style)
plt.ylabel('CDF', **_axis_label_style)
plt.grid()

# Simulated duration is derived from the simulator's start/finish times.
_simulated_duration = operations_filtered["finish_time"] - operations_filtered["start_time"]
real_cdf = plot_cdf(operations_filtered["real_duration"], "b", "real")
simulator_cdf = plot_cdf(_simulated_duration, "r", "simulator")

plt.legend(loc='upper left', prop={'size':16})
plt.show()

# Измерение утилизации ресурсов за все время симуляции

In [None]:
import ujson as json

# The file is read line by line, one JSON document per line. The `with`
# block closes the handle once every record has been parsed (from_records
# consumes the whole iterator before returning).
with open(path.join(data_dir, "cluster_utilization.json")) as utilization_file:
    utilization_data = pd.DataFrame.from_records(map(json.loads, utilization_file))
utilization_data["timestamp"] = utilization_data["timestamp"].map(tools.parse_timestamp)

# Full time span of the recording; used as default bounds by the helpers
# defined below.
earliest_time = utilization_data["timestamp"].min()
latest_time = utilization_data["timestamp"].max()

def get_utilization_slice(time_begin=earliest_time, time_end=latest_time):
    """Return the rows of `utilization_data` whose timestamp lies in
    [time_begin, time_end], both ends inclusive.

    The defaults are the earliest/latest timestamps as they were when this
    function was defined.
    """
    timestamps = utilization_data["timestamp"]
    in_window = (timestamps >= time_begin) & (timestamps <= time_end)
    return utilization_data[in_window]

def get_resource_usage(data, resource):
    """Return a Series holding `row[resource]` for every entry of the
    "resource_usage" column of `data`."""
    def pick_resource(usage_record):
        return usage_record[resource]

    usage_column = data["resource_usage"]
    return usage_column.map(pick_resource)

def get_resource_limits(data, resource):
    """Return a Series holding `row[resource]` for every entry of the
    "resource_limits" column of `data`."""
    def pick_resource(limits_record):
        return limits_record[resource]

    limits_column = data["resource_limits"]
    return limits_column.map(pick_resource)

def plot_resource_info(resource, ax, time_begin=earliest_time, time_end=latest_time):
    """Plot usage and limit of one resource over time on the given axes.

    Both curves are divided by the largest limit seen in the selected time
    window, so the limit curve peaks at 1.0.

    Parameters:
        resource: key looked up in each "resource_usage" / "resource_limits"
            record, e.g. "cpu".
        ax: matplotlib Axes to draw on. Everything is drawn through `ax`, so
            the function no longer depends on `ax` being pyplot's current
            axes.
        time_begin, time_end: inclusive bounds of the time window; default
            to the full recorded span as it was at definition time.
    """
    data = get_utilization_slice(time_begin, time_end)

    usage = get_resource_usage(data, resource)
    limits = get_resource_limits(data, resource)
    max_value = limits.max()

    ax.set_title("{} utilization (max_value={})".format(resource, max_value))
    ax.set_xlabel("time")
    ax.set_ylabel(resource)
    # Small margin around the normalized [0, 1] range.
    ax.set_ylim(-0.1, 1.1)

    ax.plot(data["timestamp"], usage / max_value, "blue")
    ax.plot(data["timestamp"], limits / max_value, "orange")

    ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%H:%M"))

In [None]:
# One panel per resource on a 3x2 grid (the sixth slot stays empty).
fig = plt.figure(figsize=(16, 12))

_resources_to_plot = ["cpu", "user_memory", "user_slots", "memory", "network"]
for _position, _resource_name in enumerate(_resources_to_plot, start=1):
    plot_resource_info(_resource_name, plt.subplot(3, 2, _position))

plt.tight_layout()
plt.show()

Числа, приведённые ниже, — это главным образом интегралы утилизации кластера.

Заметим, что, если не учитывать preemption, эти площади должны быть примерно равны вне зависимости от стратегии симулятора. Следовательно, они не могут быть использованы для оценки качества упаковки.

In [None]:
def print_utilization_integrals(data):
    """Print time integrals of resource usage plus job-duration totals.

    The usage integrals are computed from `data` with the trapezoidal rule
    over seconds elapsed since the earliest timestamp in `data`. The last
    two lines are sums over the module-level `operations` table, not over
    `data`.

    Parameters:
        data: DataFrame with a "timestamp" column and a "resource_usage"
            column of per-resource records (as read by get_resource_usage).
    """
    # NumPy 2.0 renamed trapz to trapezoid and deprecated the old name;
    # use whichever this NumPy version provides.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    timestamps = data["timestamp"]
    timesteps_in_seconds = (timestamps - timestamps.min()).map(lambda x: x.total_seconds())

    def usage_integral(values):
        return integrate(values, timesteps_in_seconds)

    print("Total cpu usage: {:.0f} cpu * second".format(
        usage_integral(get_resource_usage(data, "cpu"))))

    # user_memory is divided by 1024 ** 3 so the figure is reported in GiB.
    print("Total user_memory usage: {:.0f} GiB * second".format(
        usage_integral(get_resource_usage(data, "user_memory") / (1024 ** 3))))

    # NOTE(review): the unit label says "slots" for network -- confirm.
    print("Total network usage: {:.0f} slots * second".format(
        usage_integral(get_resource_usage(data, "network"))))

    print("Total user_slots usage: {:.0f} slots * second".format(
        usage_integral(get_resource_usage(data, "user_slots"))))

    preempted_duration = operations["preempted_jobs_total_duration"]
    print("Total user_slots usage w/o preemption: {:.0f} slots * second".format(
        (operations["jobs_total_duration"] - preempted_duration).sum()))

    print("Preempted total user_slots usage: {:.0f} slots * second".format(
        preempted_duration.sum()))

In [None]:
# Integrals over the whole recording (no time slicing applied).
print_utilization_integrals(utilization_data)

# Utilization breakdown

In [None]:
# Make sure that `$YT_SOURCE/python` is in your PYTHONPATH
from yt.scheduler_tools import scheduler_utilization
# Make sure that `ya-build` with compiled python3 yson bindings is in your PYTHONPATH
from yt import yson

# Fail early unless the imported yson module reports TYPE == "BINARY" --
# presumably this means the compiled bindings were picked up; confirm.
assert yson.TYPE == "BINARY"

In [None]:
import os

# Scheduler event log parsed by the cells below.
# NOTE(review): unlike the other input files, this path is resolved against
# the current working directory, not `data_dir` -- confirm this is intended.
event_log = "scheduler_event_log.txt"
# Say which path was checked; a bare assert fails without any hint.
assert os.path.isfile(event_log), "Event log not found: {}".format(os.path.abspath(event_log))

In [None]:
%%time
# Collect CPU and memory statistics from every "nodes_info" event in the
# scheduler event log.
utilization_info = []
# Keep the file open for the whole loop (the loader may read lazily) and
# close it afterwards.
with open(event_log, "rb") as event_log_file:
    for entry in yson.load(event_log_file, "list_fragment"):
        if entry["event_type"] != "nodes_info":
            continue
        nodes = entry["nodes"]
        timestamp = tools.parse_timestamp(entry["timestamp"])
        utilization_info.append({
            "timestamp": timestamp,
            "cpu": scheduler_utilization.get_cpu_stats(nodes).as_dict(),
            "user_memory": scheduler_utilization.get_memory_stats(nodes).as_dict(),
        })

In [None]:
%%time
# Sort the collected records in place by timestamp, then turn them into a
# DataFrame with "timestamp", "cpu" and "user_memory" columns (the two
# resource columns hold one dict per row).
utilization_info.sort(key=lambda x: x["timestamp"])
utilization_df = pd.DataFrame.from_records(utilization_info)

In [None]:
# Expand the per-row cpu dicts into columns and preview the first rows.
display(pd.DataFrame.from_records(utilization_df["cpu"]).head())

In [None]:
def plot_resource_utilization(resource, ax):
    """Draw a stacked breakdown of one resource over time on the given axes.

    Reads the module-level `utilization_df`: its `resource` column holds one
    dict of statistics per row. All statistics are divided by the largest
    "available_limit" value; for "user_memory" they are first divided by
    1024**3, so the max_value in the title is in GiB.

    Parameters:
        resource: name of a column in `utilization_df`, e.g. "cpu" or
            "user_memory".
        ax: matplotlib Axes to draw on. Everything is drawn through `ax`, so
            the function no longer depends on `ax` being pyplot's current
            axes.
    """
    timestamps = np.array(utilization_df["timestamp"], dtype=np.datetime64)
    data = pd.DataFrame.from_records(utilization_df[resource])
    if resource == "user_memory":
        gib = 1024**3
        data = data / gib

    max_value = np.max(data["available_limit"])
    data = data / max_value

    ax.set_title("{} utilization (max_value={})".format(resource, max_value))
    ax.set_xlabel("time")
    ax.set_ylabel(resource)
    # Small margin around the normalized [0, 1] range.
    ax.set_ylim(-0.1, 1.1)

    # (statistic name, fill colour), listed bottom to top in the stack.
    # One table keeps each label next to its colour.
    components = [
        ("fragmented", "orange"),
        ("allocatable", "lime"),
        ("unclaimed", "blue"),
        ("used", "yellow"),
        ("not_schedulable", "red"),
        ("limit_on_invalid_nodes", "black"),
    ]

    ax.plot(timestamps, data["available_limit"])
    ax.stackplot(timestamps,
                 *[data[name] for name, _ in components],
                 labels=[name for name, _ in components],
                 colors=[color for _, color in components])
    ax.legend(loc='upper left')
    ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%H:%M"))

In [None]:
# Two stacked panels: the cpu breakdown on top, user_memory below.
fig = plt.figure(figsize=(16, 12))
for _row, _resource_name in enumerate(("cpu", "user_memory"), start=1):
    plot_resource_utilization(_resource_name, plt.subplot(2, 1, _row))
plt.show()