# Analysis of Experiments

## Read data

Look for data in each subdirectory of `OUTPUT_DIR`.

In [None]:
from pathlib import Path

import pandas as pd

In [None]:
# Root directory that is scanned for experiment results.
OUTPUT_DIR = Path('output')


# Keep only the immediate children of OUTPUT_DIR that are directories;
# plain files sitting next to them are skipped.
DATA_DIRS: list[Path] = list(filter(Path.is_dir, OUTPUT_DIR.iterdir()))

In [None]:
import json


# Raw result records gathered from every directory in DATA_DIRS. Each record
# is tagged with a 'src' key holding the name of the directory it came from.
RAW_DATA: list[dict] = []

for data_dir in DATA_DIRS:
    data = []

    # First layout: every file matching '*_results.json' holds one record.
    for rfile in data_dir.glob('*_results.json'):
        # JSON is UTF-8 by specification, so do not depend on the platform's
        # locale default encoding when reading it.
        with rfile.open(encoding='utf-8') as fp:
            result = json.load(fp)
        result['src'] = data_dir.name
        data.append(result)

    # Fallback layout, used only when no '*_results.json' file was found:
    # a single 'results.json' that is iterated as a collection of records
    # (presumably a JSON array -- confirm against the producer).
    combined_file = data_dir / 'results.json'
    if not data and combined_file.exists():
        with combined_file.open(encoding='utf-8') as fp:
            results = json.load(fp)
        for result in results:
            result['src'] = data_dir.name
            data.append(result)

    RAW_DATA.extend(data)

# Flatten nested objects into dot-separated columns (e.g. 'params.threshold').
RAW_DATA_df = pd.json_normalize(RAW_DATA)

# Preview up to five random rows. The size is capped because sample(5) raises
# ValueError when the frame holds fewer than five rows.
RAW_DATA_df.sample(min(5, len(RAW_DATA_df)))

In [None]:
# Display the column labels of the normalised frame; nested JSON keys show up
# as dot-separated names (e.g. 'params.threshold').
RAW_DATA_df.columns

### Summarise data

In [None]:
# A run counts as randomised when it used the random generator class or when
# its 'randomise' parameter is set.
_uses_random_generator = RAW_DATA_df['generator'] == 'PBLogGeneratorRandom'

# Build the summary in a single step: start from the identifying columns and
# attach one short-named column per metric of interest. assign() returns a new
# frame, so RAW_DATA_df itself is never modified.
SUMMARY = RAW_DATA_df[['src', 'id', 'generator']].assign(
    threshold=RAW_DATA_df['params.threshold'],
    randomised=(_uses_random_generator | RAW_DATA_df['params.randomise']),
    model=RAW_DATA_df['params.model'],
    case_size=RAW_DATA_df['params.events'],
    log_size=RAW_DATA_df['params.traces'],
    generated=RAW_DATA_df['cases'],
    # Ratio of 'cases' to the 'params.traces' value of the same run.
    coverage=RAW_DATA_df['cases'] / RAW_DATA_df['params.traces'],
    time=RAW_DATA_df['stats.times.total'],
    timeout=RAW_DATA_df['stats.timedout'],
    hamming=RAW_DATA_df['control_flow.hamming'],
    levenshtein=RAW_DATA_df['control_flow.levenshtein'],
)

SUMMARY

### Write summary to file

In [None]:
from datetime import datetime


# Timestamp the file name so that summaries written at different times end up
# in separate files. The timestamp is computed on its own line because reusing
# the enclosing quote character inside an f-string replacement field is a
# SyntaxError on Python versions before 3.12 (PEP 701).
_summary_timestamp = datetime.now(tz=None).strftime('%Y-%m-%dT%H%M%S')
SUMMARY.to_csv(OUTPUT_DIR / f'summary_{_summary_timestamp}.csv')

## Data analysis

Show results for each generator class and configuration. Each table is laid out by log size (rows) and case size (columns).

In [None]:
from IPython.display import display

def show_table(key: str) -> None:
    """Display one styled pivot table of ``SUMMARY[key]`` per configuration.

    A configuration is a distinct combination of ``src``, ``generator``,
    ``randomised`` and ``threshold`` (rows with a missing value in one of
    these columns are kept as their own group). Each table has ``log_size``
    as rows and ``case_size`` as columns, and cells whose ``timeout`` value
    is truthy are shaded pink.

    Args:
        key: Name of the ``SUMMARY`` column whose values fill the table cells.
    """
    ordered = SUMMARY.sort_values(by=['generator', 'randomised', 'threshold', 'src'])
    # sort=False keeps the groups in the order established by sort_values
    # above; with the default sort=True, groupby re-sorts the groups by their
    # key (src first) and the explicit ordering would be thrown away.
    groups = ordered.groupby(
        by=['src', 'generator', 'randomised', 'threshold'],
        dropna=False,
        sort=False,
    )
    for (src, generator, randomised, threshold), group in groups:
        # Build the CSS overlay once per group instead of inside the styling
        # callback. It shares index and columns with the value table below
        # because both are pivots of the same rows.
        timed_out = group.pivot(index='log_size', columns='case_size', values='timeout')
        highlight = timed_out.map(lambda flag: 'background-color: pink' if flag else '')

        table = group.pivot(index='log_size', columns='case_size', values=key)
        display(
            table.style
            .set_caption(f'{generator}(rnd={randomised}, threshold={threshold}) src: {src}')
            # With axis=None the callback receives the whole table and must
            # return a same-shaped frame of CSS strings; binding the overlay
            # as a default argument avoids a late-binding closure.
            .apply(lambda _, css=highlight: css, axis=None)
        )

### Coverage

In [None]:
show_table('coverage')

### Time

In [None]:
show_table('time')

### Levenshtein distance

In [None]:
show_table('levenshtein')

### Generated log size

In [None]:
show_table('generated')

### Timed out

In [None]:
show_table('timeout')