In [None]:
import numpy as np
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import os

In [None]:
%load_ext autoreload
%autoreload 1
%aimport analyze_simulation

import analyze_simulation as tools

In [None]:
# Runs to compare: display label -> directory holding that run's output.
# The loading cell reads `operations_stats.csv` from each directory, and the
# label is reused in the printed summaries and as the plot legend entry.
experiments = {
    "experimental": "history/hahn/run_2018-09-26_12-32/data",
    "stable": "history/hahn/run_2018-09-22_11-28/data",
    "production-experimental": "history/hahn/production-experimental/data",
    "production-stable": "history/hahn/production-stable/data",
}

In [None]:
# Load the per-operation statistics of every experiment, keeping both the
# complete table (`operations`) and a time-windowed view (`operations_filtered`).
operations = {}
operations_filtered = {}

# Window handed to tools.time_filter. The values are in the same unit as
# `finish_time`, which is divided by 60 * 60 below to print hours.
# Swap in the commented pair to restrict the analysis to a sub-interval.
start_time_threshold = 0
end_time_threshold = 1e9
# start_time_threshold = 60 * 60 * 2
# end_time_threshold = 60 * 60 * 10

for name, data_dir in experiments.items():
    operations[name] = pd.read_csv(os.path.join(data_dir, 'operations_stats.csv'))
    # NOTE(review): the return value is discarded, so this presumably mutates
    # the frame in place - confirm against analyze_simulation.
    tools.preprocess_operations(operations[name])

    # Series.max() is vectorised and skips NaN; builtin max() iterates in
    # Python and gives order-dependent results when NaN is present.
    max_time = operations[name]['finish_time'].max() / 60.0 / 60.0
    print("{0}: Max finish time (unfiltered): {1:.3f}h".format(name, max_time))

    operations_filtered[name] = tools.time_filter(operations[name], start_time_threshold, end_time_threshold)

In [None]:
# Report the finish-time percentiles of every (filtered) experiment, in hours.
# The loop variable is deliberately NOT called `operations`: that name is the
# dict of unfiltered frames built by the loading cell and must survive this cell.
for name, ops in operations_filtered.items():
    for percentile, finish_time in tools.get_finish_time_percentiles(ops):
        print("{}: {}% finish time: {:.3f}h".format(name, percentile, finish_time / 60.0 / 60.0))
    print()

In [None]:
import matplotlib.pyplot as plt

from bisect import bisect_left

class discrete_cdf:
    """Empirical distribution function of a finite sample.

    Calling an instance with ``point`` returns the fraction of samples that
    are strictly smaller than ``point`` (a value in ``[0.0, 1.0]``).
    Samples equal to ``point`` are not counted because the lookup uses
    ``bisect_left``.

    Parameters
    ----------
    data : iterable of mutually comparable values
        The sample. Any iterable is accepted, including one-shot iterators;
        it is copied into a sorted list.

    Raises
    ------
    ValueError
        If ``data`` is empty (the fraction would be undefined).
    """

    def __init__(self, data):
        # bisect requires sorted input; sorted() also materialises iterators.
        self._data = sorted(data)
        if not self._data:
            raise ValueError("discrete_cdf requires at least one sample")
        # Take the length of the materialised list, not of `data`, so that
        # generators (which have no len and are already consumed) work too.
        self._data_len = float(len(self._data))

    def __call__(self, point):
        # The insertion index is exactly the number of samples below `point`;
        # no need to slice the list just to measure the slice.
        return bisect_left(self._data, point) / self._data_len

def plot_cdf(data, label):
    """Draw the empirical CDF of ``data`` on the current matplotlib axes.

    The CDF is sampled at the integers ``0, 1, ..., max(data) - 1`` (the
    maximum itself is excluded by ``np.arange``).

    Parameters
    ----------
    data : object exposing ``.values`` and supporting ``max()``
        The sample to plot; the notebook passes a pandas Series.
    label : str
        Legend entry for the curve.

    Returns
    -------
    Whatever ``plt.plot`` returns for the drawn curve.
    """
    empirical = discrete_cdf(data.values)
    grid = np.arange(0, max(data))
    fractions = list(map(empirical, grid))
    return plt.plot(grid, fractions, label=label)

# One figure with the duration CDF of every experiment overlaid; the x axis is
# logarithmic so that short and long operations are both readable.
plt.figure(figsize=(18, 8))
plt.xscale('log')
plt.ylabel('CDF', size=14)
plt.xlabel('duration', size=14)
plt.grid()

for name, ops in operations_filtered.items():
    # Duration of an operation = finish_time - start_time.
    durations = ops["finish_time"] - ops["start_time"]
    simulator_cdf = plot_cdf(durations, name)

plt.legend(prop={'size': 16}, loc='upper left')
plt.show()