# Analysis of Experiments

In [None]:
from datetime import datetime, timezone
from pathlib import Path
import subprocess
from typing import Sequence

import numpy as np
import pandas as pd

In [None]:
# --- Notebook configuration -------------------------------------------------

# Directory where the outputs are written; its sub-directories are also the
# ones scanned for experiment results.
OUTPUT_DIR = 'output'

# File-name prefix of the exported summary.
EXPORT_PREFIX = 'analysis'
# Tag appended to exported file names; when left as None it is replaced by a
# UTC timestamp in the next cell.
EXPORT_TAG = None

# Debug switch (not referenced by the cells visible in this notebook).
DEBUG = False


In [None]:
# Fall back to a UTC timestamp when no explicit export tag was configured.
if EXPORT_TAG is None:
    EXPORT_TAG = datetime.now(tz=timezone.utc).strftime(r'%Y-%m-%dT%H%M%SZ')

# Make sure the output directory exists before anything is read or written.
EXPORT_PATH = Path(OUTPUT_DIR)
EXPORT_PATH.mkdir(parents=True, exist_ok=True)

print(f'Results written on directory <{EXPORT_PATH.as_posix()}>')

# Record the code version, best effort only: the notebook must still run
# outside a git checkout or on a machine without git installed.
try:
    # stderr is captured so that the failure message below has real content
    # (without it `CalledProcessError.stderr` is always None).
    git_version = subprocess.check_output(
        ['git', 'describe', '--dirty'],
        stderr=subprocess.PIPE,
        text=True,
    ).strip()
except subprocess.CalledProcessError as e:
    print(f'Git info not available: {(e.stderr or "").strip()}')
except OSError as e:
    # Raised when the git executable is missing or cannot be executed.
    print(f'Git info not available: {e}')
else:
    print('Git describe: ' + git_version)

# Every sub-directory of the output directory is treated as a data source;
# sorting keeps the processing order deterministic across runs.
DATA_DIRS: list[Path] = sorted(p for p in EXPORT_PATH.glob('*') if p.is_dir())
print('Reading data from:')
for p in DATA_DIRS:
    print(f'  - {p}')
print(f'Writing output in: {EXPORT_PATH.as_posix()}')

## Read data

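Look for result files in each directory listed in `DATA_DIRS`: every `*_results.json` file is loaded and, if none is found, a single `results.json` containing a list of results is read instead.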

In [None]:
import json


# Flat list with one dict per experiment result, each tagged with the
# directory it was read from (key 'src').
RAW_DATA: list[dict] = []

for data_dir in DATA_DIRS:
    src = data_dir.as_posix()
    data = []

    # Preferred layout: one '<name>_results.json' document per experiment.
    for rfile in data_dir.glob('*_results.json'):
        # JSON is exchanged as UTF-8; do not depend on the locale encoding.
        with rfile.open(encoding='utf-8') as fp:
            result = json.load(fp)
        result['src'] = src
        data.append(result)

    # Fallback layout: a single 'results.json' holding a list of results,
    # only consulted when no per-experiment file was found.
    combined = data_dir / 'results.json'
    if not data and combined.exists():
        with combined.open(encoding='utf-8') as fp:
            results = json.load(fp)
        for result in results:
            result['src'] = src
            data.append(result)

    RAW_DATA.extend(data)

# Explicit raise instead of `assert`, which is stripped under `python -O`.
if not RAW_DATA:
    raise RuntimeError('No data found in any of the directories')

# Nested keys are flattened into dotted column names (e.g. 'params.traces').
RAW_DATA_df = pd.json_normalize(RAW_DATA)

# Preview up to five random rows; `sample` raises when asked for more rows
# than the frame contains.
RAW_DATA_df.sample(min(5, len(RAW_DATA_df)))

In [None]:
# Show the flattened column names available in the raw data (the dotted names
# such as 'params.traces' are the ones used to build SUMMARY below).
RAW_DATA_df.columns

### Summarise data

In [None]:
# Default for the optional distance columns.  It must be a Series, not a
# scalar: `DataFrame.get(name, 0)` returns the plain int 0 when the column is
# missing, and `.fillna` would then fail with AttributeError.
_zeros = pd.Series(0, index=RAW_DATA_df.index)

# One row per experiment with the columns used by the analysis tables.
# Optional columns are read with `.get` so that a missing one becomes empty
# instead of raising KeyError.
SUMMARY = RAW_DATA_df[['src', 'id', 'generator']].assign(
    threshold=RAW_DATA_df.get('params.threshold'),
    # The random generator counts as randomised regardless of its parameters.
    randomised=((RAW_DATA_df['generator'] == 'PBLogGeneratorRandom') | RAW_DATA_df.get('params.randomise', default=False)),
    batches=RAW_DATA_df.get('params.batches'),
    model=RAW_DATA_df.get('params.model'),
    case_size=RAW_DATA_df.get('params.events'),
    log_size=RAW_DATA_df.get('params.traces'),
    generated=RAW_DATA_df.get('cases'),
    # Fraction of `cases` over `params.traces`.
    coverage=RAW_DATA_df['cases'] / RAW_DATA_df['params.traces'],
    time=RAW_DATA_df.get('stats.times.total'),
    timeout=RAW_DATA_df.get('stats.timedout'),
    # Missing distances (absent column or NaN entries) are reported as 0.
    hamming=RAW_DATA_df.get('control_flow.hamming', _zeros).fillna(0),
    levenshtein=RAW_DATA_df.get('control_flow.levenshtein', _zeros).fillna(0),
)

# Preview up to five random rows; `sample` raises when asked for more rows
# than the frame contains.
SUMMARY.sample(min(5, len(SUMMARY)))

### Write summary to file

In [None]:
# Export the summary table as '<prefix>_summary_<tag>.csv' inside the export
# directory and report where it was written.
fname = EXPORT_PATH.joinpath(f'{EXPORT_PREFIX}_summary_{EXPORT_TAG}.csv')
SUMMARY.to_csv(path_or_buf=fname)
print(f'Summary data written on {fname}')

## Data analysis

Show results for each generator class and configuration; tables are summarised by log and case sizes. Timed-out cases are highlighted in all tables.

In [None]:
from IPython.display import display

def show_table(key: str) -> None:
    """Display one styled pivot table of ``key`` per experiment configuration.

    ``SUMMARY`` is grouped by model, generator, threshold, randomisation flag
    and source directory.  For each group the maximum of ``key`` is shown in a
    ``log_size`` x ``case_size`` grid, and every cell in which at least one
    run timed out gets a pink background.

    Args:
        key: Name of the ``SUMMARY`` column to tabulate.
    """
    group_keys = ['model', 'generator', 'threshold', 'randomised', 'src']
    for (model, gen, threshold, randomised, src), df in SUMMARY.groupby(by=group_keys, dropna=False, sort=True):
        label = f'{gen}(model={model}, rnd={randomised}, threshold={threshold})'
        print(f'Processing {label} from {src}')

        pivot_df = pd.pivot_table(df, index='log_size', columns='case_size', values=key, aggfunc='max')

        # Pivot the timeout flag separately, then align it with the labels of
        # the value table: `Styler.apply(axis=None)` requires a style frame
        # with identical index and columns, and the two pivots can differ
        # when one of them drops all-NaN rows or columns.
        flagged = df.assign(_timed_out=df['timeout'].eq(True))
        timed_out = (
            pd.pivot_table(flagged, index='log_size', columns='case_size', values='_timed_out', aggfunc='max')
            .reindex(index=pivot_df.index, columns=pivot_df.columns)
            .eq(True)  # cells introduced by the reindex are NaN -> not timed out
        )
        styles = pd.DataFrame(
            np.where(timed_out.to_numpy(dtype=bool), 'background-color: pink', ''),
            index=pivot_df.index,
            columns=pivot_df.columns,
        )

        display(pivot_df.style
                .set_caption(f'{label}<br>src:&nbsp;{src}')
                # Bind `styles` now; the styler evaluates the callback lazily.
                .apply(lambda _, styles=styles: styles, axis=None))

def select_data(
    model: 'str | None' = None,
    gen: 'str | None' = None,
    threshold: 'float | None' = None,
    randomised: 'bool | None' = None,
    src: 'str | None' = None,
) -> pd.DataFrame:
    """Return the rows of ``SUMMARY`` matching every given criterion.

    Criteria left as ``None`` are ignored.  Rows are selected with boolean
    masks rather than a string-built ``DataFrame.query`` expression, so values
    containing quotes (e.g. in a source path) cannot break or alter the
    filter.

    Args:
        model: Required value of the ``model`` column.
        gen: Required value of the ``generator`` column.
        threshold: Required value of the ``threshold`` column.
        randomised: Required value of the ``randomised`` column.
        src: Required value of the ``src`` column.

    Returns:
        The filtered rows, or ``SUMMARY`` itself when no criterion is given.
    """
    criteria = {
        'model': model,
        'generator': gen,
        'threshold': threshold,
        'randomised': randomised,
        'src': src,
    }
    active = {column: value for column, value in criteria.items() if value is not None}
    if not active:
        return SUMMARY

    mask = pd.Series(True, index=SUMMARY.index)
    for column, value in active.items():
        mask &= (SUMMARY[column] == value)
    return SUMMARY[mask]


### Coverage

In [None]:
# Coverage is `cases` divided by `params.traces` (see SUMMARY); one table is
# displayed per configuration.
show_table('coverage')

### Time

In [None]:
# `time` is taken from the 'stats.times.total' field of each result.
show_table('time')

### Levenshtein distance

In [None]:
# `levenshtein` is taken from 'control_flow.levenshtein'; missing values are
# shown as 0.
show_table('levenshtein')

### Generated log size

In [None]:
# `generated` is the 'cases' field of each result.
show_table('generated')

### Timed out

In [None]:
# `timeout` is the 'stats.timedout' field of each result; the same flag drives
# the highlighting in every table.
show_table('timeout')

### Experiment ids

In [None]:
# Show which experiment `id` backs each cell (the maximum id per cell, as the
# tables aggregate with 'max').
show_table('id')