In [1]:
%matplotlib inline
import functools
import itertools
import multiprocess as mp

import modin.pandas as pd
from tqdm.notebook import tqdm, trange
import seaborn as sns
import matplotlib.pyplot as plt

import edgedroid.data as e_data
import edgedroid.util as util
from edgedroid.execution_times import *
from mpl_toolkits import mplot3d

# Passed as `transition_fade_distance` to preprocessing here and to the models
# constructed in later cells.
fade_distance = 4

# Keep the loader's full return value so it can be forwarded unchanged to
# `preprocess_data`; its first element is kept separately as `raw_data`.
raw_data_params = e_data.load_default_exec_time_data()
raw_data, *_ = raw_data_params

data = preprocess_data(
    *raw_data_params,
    transition_fade_distance=fade_distance,
)
data

Unnamed: 0,run_id,seq,next_exec_time,delay,neuroticism,neuroticism_raw,impairment,transition,duration,duration_raw
0,134146,1,4.433,0.0,"[0.3333333333333333, 0.6666666666666666)",0.375,"[-inf, 1.0)",NoTransition,"[0.0, 5.0)",1
1,134146,2,2.956,0.0,"[0.3333333333333333, 0.6666666666666666)",0.375,"[-inf, 1.0)",NoTransition,"[0.0, 5.0)",2
2,134146,3,5.443,0.0,"[0.3333333333333333, 0.6666666666666666)",0.375,"[-inf, 1.0)",NoTransition,"[0.0, 5.0)",3
3,134146,4,5.206,0.0,"[0.3333333333333333, 0.6666666666666666)",0.375,"[-inf, 1.0)",NoTransition,"[0.0, 5.0)",4
4,134146,5,4.783,0.0,"[0.3333333333333333, 0.6666666666666666)",0.375,"[-inf, 1.0)",NoTransition,"[5.0, 10.0)",5
...,...,...,...,...,...,...,...,...,...,...
6715,137353,164,6.501,0.0,"[0.3333333333333333, 0.6666666666666666)",0.625,"[-inf, 1.0)",NoTransition,"[0.0, 5.0)",3
6716,137353,165,4.722,0.0,"[0.3333333333333333, 0.6666666666666666)",0.625,"[-inf, 1.0)",NoTransition,"[0.0, 5.0)",4
6717,137353,166,3.475,0.0,"[0.3333333333333333, 0.6666666666666666)",0.625,"[-inf, 1.0)",NoTransition,"[5.0, 10.0)",5
6718,137353,167,2.296,0.0,"[0.3333333333333333, 0.6666666666666666)",0.625,"[-inf, 1.0)",NoTransition,"[5.0, 10.0)",6


In [2]:
# Summary statistics of the next execution time for every
# (impairment, transition, duration) combination in the preprocessed data.
(
    data
    .groupby(["impairment", "transition", "duration"])["next_exec_time"]
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
impairment,transition,duration,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"[-inf, 1.0)",Higher2Lower,"[0.0, 5.0)",585.0,5.926154,2.734934,0.032,4.338,5.542,6.896,37.923
"[-inf, 1.0)",NoTransition,"[0.0, 5.0)",670.0,5.48749,2.537998,1.867,3.85025,4.9995,6.5265,27.125
"[-inf, 1.0)",NoTransition,"[5.0, 10.0)",513.0,4.981975,2.527118,1.847,3.288,4.45,5.954,29.495
"[-inf, 1.0)",NoTransition,"[10.0, inf)",172.0,4.637052,2.525268,1.839,2.879,3.9505,5.331,16.52
"[1.0, 2.0)",Higher2Lower,"[0.0, 5.0)",373.0,6.417748,2.966883,1.739,4.65,5.892,7.67,29.504
"[1.0, 2.0)",Lower2Higher,"[0.0, 5.0)",104.0,6.205846,3.067246,2.09,4.3345,5.859,7.20475,26.269
"[1.0, 2.0)",NoTransition,"[0.0, 5.0)",320.0,5.247666,2.276933,1.218,3.7115,4.704,6.24925,16.438
"[1.0, 2.0)",NoTransition,"[5.0, 10.0)",400.0,5.722545,3.04135,1.44,4.0295,5.3085,6.7015,41.629
"[1.0, 2.0)",NoTransition,"[10.0, inf)",720.0,5.518681,2.487925,0.949,3.90825,5.1475,6.665,22.326
"[2.0, inf)",Lower2Higher,"[0.0, 5.0)",946.0,6.595813,3.857277,0.214,4.41725,5.8665,7.89275,56.017


In [3]:
# Display the raw table unpacked from the first element of `raw_data_params`.
raw_data

Unnamed: 0,run_id,seq,exec_time,delay,neuroticism
0,134146,1,4.433,0.0,0.375
1,134146,2,2.956,0.0,0.375
2,134146,3,5.443,0.0,0.375
3,134146,4,5.206,0.0,0.375
4,134146,5,4.783,0.0,0.375
...,...,...,...,...,...
6715,137353,164,6.501,0.0,0.625
6716,137353,165,4.722,0.0,0.625
6717,137353,166,3.475,0.0,0.625
6718,137353,167,2.296,0.0,0.625


In [4]:
# generate data for model
from typing import Type


def _gen_trace(task):
    """Build a fresh model inside the worker and generate one trace from it.

    ``task`` is a ``(delays, model_cls, model_data, neuroticism, fade)`` tuple.
    A new model instance is constructed for every trace, exactly as the
    previous inline lambda did (presumably because a model carries state
    between steps -- TODO confirm).
    """
    delays, model_cls, model_data, neuroticism, fade = task
    model = model_cls(
        data=model_data,
        neuroticism=neuroticism,
        transition_fade_distance=fade,
    )
    return util.gen_model_trace(delays=delays, model=model)


num_traces = 100
run_ids = raw_data.run_id.unique()
# NOTE(review): `run_len` is not referenced again in this cell; it is kept in
# case a later cell relies on it (it also advances the global NumPy RNG).
run_len = len(raw_data[raw_data.run_id == np.random.choice(run_ids)].index)

with mp.Pool() as pool:
    dfs = deque()

    for model_cls in (TheoreticalExecutionTimeModel, EmpiricalExecutionTimeModel):
        model_name = model_cls.__name__

        for participant in tqdm(run_ids, desc=model_name, leave=True):
            # Filter this participant's rows once and reuse the result.
            participant_rows = raw_data[raw_data.run_id == participant]
            raw_delays = participant_rows.delay
            neuro = participant_rows.neuroticism.values[0]

            # Every one of the `num_traces` tasks for this participant carries
            # the same arguments, so a single tuple is repeated.
            task = (raw_delays, model_cls, data, neuro, fade_distance)
            for trace in pool.imap(_gen_trace, itertools.repeat(task, num_traces)):
                trace["run_id"] = participant
                trace.index = trace.index.set_names(["step_index"])
                trace["model"] = model_name
                # Turn the step index into a regular `step_index` column.
                dfs.append(trace.reset_index())

results = pd.concat(dfs, ignore_index=True)

TheoreticalExecutionTimeModel:   0%|          | 0/40 [00:00<?, ?it/s]

Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Process ForkPoolWorker-7:
Process ForkPoolWorker-3:
Process ForkPoolWorker-12:
Process ForkPoolWorker-9:
Process ForkPoolWorker-11:
Process ForkPoolWorker-4:
Process ForkPoolWorker-8:

KeyboardInterrupt



In [None]:
# Display the concatenated traces generated for both model classes.
results

In [None]:
# Keep only the columns needed for the comparison with the underlying data and
# rename them to the column names used by `data`.
df1 = (
    results
    .drop(
        columns=[
            name
            for name in results.columns
            if name not in {
                'step_index',
                'model neuroticism (binned)',
                'latest impairment',
                'latest transition',
                'run_id',
                'current duration (binned)',
                'exec_time',
                'model',
            }
        ]
    )
    .rename(
        columns={
            'step_index': 'seq',
            'model neuroticism (binned)': 'neuroticism',
            'latest impairment': 'impairment',
            'latest transition': 'transition',
            'current duration (binned)': 'duration',
            'exec_time': 'next_exec_time',
        }
    )
)
df1

In [None]:
# Copy of the preprocessed data with `seq` shifted down by one and a `model`
# label identifying these rows as the underlying (non-generated) data.
df2 = data.assign(
    seq=data["seq"] - 1,
    model="Underlying data",
)

df2

In [None]:
# Stack the generated traces (df1) on top of the underlying data (df2) under a
# fresh 0..n-1 index.
exec_times = pd.concat([df1, df2], ignore_index=True)
exec_times

In [None]:
# Work on a temporary copy with an extra `state` column holding the
# (impairment, transition, duration) tuple of each row, used to facet the plot.
_df = exec_times.assign(
    state=lambda frame: frame[["impairment", "transition", "duration"]].apply(
        lambda row: tuple(row.values),
        axis=1,
    )
)

# One KDE panel per state, one curve per model; each curve is normalised on
# its own (common_norm=False).
fg = sns.displot(
    data=_df,
    x="next_exec_time",
    hue="model",
    col="state",
    col_wrap=3,
    kind="kde",
    common_norm=False,
)
plt.show()
del _df

# for g, df_sg in exec_times.groupby(['impairment', 'transition', 'duration']):
#     ax = sns.kdeplot(
#         # kind='kde',
#         data=df_sg,
#         x='next_exec_time',
#         hue='model',
#         # cumulative=True,
#         common_norm=False
#     )
#     ax.set_title(str(g))
#     plt.show()

In [None]:
# randomly sample participant, compare with model, repeat

samples = 500
run_ids = raw_data.run_id.unique()
rng = np.random.default_rng()

dfs = deque()

for model_cls in tqdm((EmpiricalExecutionTimeModel, TheoreticalExecutionTimeModel), desc="Models"):
    model_name = model_cls.__name__

    for si in trange(samples, desc=f"Samples for model {model_name}"):
        run = rng.choice(run_ids)
        df = raw_data[raw_data.run_id == run]

        # df holds data for a single participant
        delays = df.delay.values
        neuro = df.neuroticism.values[0]
        # The participant's recorded execution times padded with one trailing
        # NaN, so the array has one more entry than the participant has rows
        # (presumably to match the length of the generated trace -- confirm).
        # Deliberately NOT named `exec_times`: that name is the combined
        # DataFrame built in an earlier cell, and overwriting it with an array
        # breaks re-running the plotting cell that reads it.
        base_exec_times = np.concatenate((df.exec_time, np.array([np.nan])))

        model = model_cls(data=data, neuroticism=neuro, transition_fade_distance=fade_distance)
        trace = util.gen_model_trace(delays, model)

        # Ratio of each generated execution time to the recorded one.
        trace["exec_time_rel"] = trace.exec_time / base_exec_times
        trace["base_exec_times"] = base_exec_times
        trace["run_id"] = run
        trace["sample"] = si
        trace["model"] = model_name
        dfs.append(trace)

trace_data = pd.concat(dfs, ignore_index=True)

In [None]:
# Cumulative KDE of the generated/recorded execution-time ratio, one panel per
# model, with a reference line at ratio 1.
fg = sns.displot(
    data=trace_data,
    x="exec_time_rel",
    col="model",
    kind="kde",
    cumulative=True,
    common_norm=False,
)
fg.set_xlabels("Ratio between generated times\nand underlying empirical times")
fg.refline(x=1)
plt.show()

In [None]:
# Per-model summary of the ratio, including the extreme upper percentiles.
(
    trace_data
    .groupby("model")["exec_time_rel"]
    .describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.99, 0.999])
)

In [None]:
# Histogram of the generated/recorded ratio per model; the y axis is
# logarithmic so that sparsely populated bins remain visible.
fg = sns.displot(
    data=trace_data,
    x="exec_time_rel",
    col="model",
    kind="hist",
    bins=100,
)
fg.set_xlabels("Ratio between generated times\nand underlying empirical times")
fg.set(yscale="log")
plt.show()

In [None]:
# Display the concatenated per-sample traces for both models.
trace_data

In [None]:
# Trace columns that identify the model state at each step.
params = ["latest impairment", "latest transition", "current duration (binned)"]

# Ratio statistics per (state, model) group, with the groups containing the
# largest ratios listed first.
(
    trace_data
    .groupby([*params, "model"])["exec_time_rel"]
    .describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.999])
    .sort_values("max", ascending=False)
)

In [None]:
# neuro = raw_data.neuroticism.unique()
#
# for n in neuro:
#     lt = n - 0.15
#     ut = n + 0.15
#     print(neuro[(neuro > lt) & (neuro < ut)])

# Number of distinct run ids in each neuroticism group.
data.groupby("neuroticism")["run_id"].apply(lambda run_col: len(np.unique(run_col)))