# CAiSE 2025 experiments: testing generators

Runs the original generator and the new generator variants to produce positive logs, and reports the average Levenshtein and Hamming distances between the generated traces.

In [None]:
from Declare4Py.ProcessMiningTasks.LogGenerator.PositionalBased.PositionalBasedLogGenerator import PositionalBasedLogGenerator
from Declare4Py.ProcessMiningTasks.LogGenerator.PositionalBased.PositionalBasedLogGeneratorNG import PBLogGeneratorOrig, PBLGwrapper, PBLogGeneratorBaseline, PBLogGeneratorRandom, PBLogGeneratorHamming, PBLogGeneratorLevenshtein
from Declare4Py.ProcessMiningTasks.LogGenerator.PositionalBased.PositionalBasedModel import PositionalBasedModel


In [None]:
from datetime import datetime, timezone
import itertools
import json
from pathlib import Path

# Experiment grid: the loops below take the cartesian product of these lists.
MAX_TRACES = [10]
MAX_EVENTS = [20]
NOISE = [0]
THRESHOLDS = [.3, .6]

# Fixed seed handed to the new generator classes.
SEED = b'\x81\x97u+'

MODEL_NAME = 'experimental_model_ivan 2'
MODEL_PATH = Path(MODEL_NAME + '.decl')

# One output directory per run, named after the current UTC time (minute resolution).
EXPORT_PATH = Path('output') / ('tests_' + datetime.now(timezone.utc).isoformat(timespec='minutes'))
# exist_ok=False: refuse to reuse (and overwrite) the directory of an earlier run.
EXPORT_PATH.mkdir(exist_ok=False, parents=True)

model = PositionalBasedModel().parse_from_file(MODEL_PATH.as_posix())

# Experiment id -> {'class', 'parameters', 'args', 'description'}; filled in below.
EXPERIMENTS: dict[str, dict] = dict()

# original code

# Register, for every point of the grid, the original generator and its wrapper.
# Both entries share the same parameter and argument dictionaries.
for model_name, traces, events, noise in itertools.product([MODEL_NAME], MAX_TRACES, MAX_EVENTS, NOISE):
    exp_id = f'_{model_name}_{traces:04}_{events:03}_{noise:02}'
    params = {'model': model_name, 'traces': traces, 'events': events, 'noise': noise}
    exp_args = {
        # keyword arguments for the generator constructor
        'init': {
            'total_traces': traces,
            'min_events': events,
            'max_events': events,
            'pb_model': model,
            'verbose': False,
        },
        # keyword arguments for the generator's run() method
        'run': {
            'equal_rule_split': True,
            'high_variability': False,
            'generate_negatives_traces': False,
            'positive_noise_percentage': noise,
            'negative_noise_percentage': noise,
            'append_results': False,
        },
    }
    EXPERIMENTS.update({
        'orig' + exp_id: {
            'class': PositionalBasedLogGenerator,
            'parameters': params,
            'args': exp_args,
            'description': 'Original code',
        },
        'wrapper' + exp_id: {
            'class': PBLGwrapper,
            'parameters': params,
            'args': exp_args,
            'description': 'Wrapper for original code, for testing purposes',
        },
    })

# new code

# Register the threshold-free variants of the new generators for every grid point.
# The three entries share the same parameter and argument dictionaries.
for model_name, traces, events, noise in itertools.product([MODEL_NAME], MAX_TRACES, MAX_EVENTS, NOISE):
    exp_id = f'_{model_name}_{traces:04}_{events:03}_{noise:02}'
    params = {'model': model_name, 'traces': traces, 'events': events, 'noise': noise}
    exp_args = {
        # keyword arguments for the generator constructor
        'init': {
            'total_traces': traces,
            'min_event': events,
            'max_event': events,
            'process_model': model,
            'log': None,
            'verbose': False,
            'seed': SEED,
        },
        # keyword arguments for the generator's run() method
        'run': {
            'equal_rule_split': True,
            'high_variability': False,
            'generate_negatives_traces': False,
            'positive_noise_percentage': noise,
            'negative_noise_percentage': noise,
            'append_results': False,
        },
    }

    EXPERIMENTS.update({
        'old' + exp_id: {
            'class': PBLogGeneratorOrig,
            'parameters': params,
            'args': exp_args,
            'description': 'Reimplementation of original code',
        },
        'baseline' + exp_id: {
            'class': PBLogGeneratorBaseline,
            'parameters': params,
            'args': exp_args,
            'description': 'No attempt to introduce variability in the generated logs',
        },
        'random' + exp_id: {
            'class': PBLogGeneratorRandom,
            'parameters': params,
            'args': exp_args,
            'description': 'Uses clingo randomisation to generate different models',
        },
    })

# Register the distance-threshold generators: one Hamming and one Levenshtein
# experiment for every grid point, threshold and randomisation setting.
for model_name, traces, events, noise, threshold, randomise in itertools.product([MODEL_NAME], MAX_TRACES, MAX_EVENTS, NOISE, THRESHOLDS, [False, True]):
    params = {
        'model': model_name,
        'traces': traces,
        'events': events,
        'noise': noise,
        'threshold': threshold,
        'randomise': randomise
    }
    exp_args = {
        # keyword arguments for the generator constructor
        'init': {
            'total_traces': traces,
            'min_event': events,
            'max_event': events,
            'process_model': model,
            'log': None,
            'verbose': False,
            'seed': SEED,
            'threshold': threshold,
            'randomise': randomise},
        # keyword arguments for the generator's run() method
        'run': {
            'equal_rule_split': True,
            'high_variability': False,
            'generate_negatives_traces': False,
            'positive_noise_percentage': noise,
            'negative_noise_percentage': noise,
            'append_results': False}}
    # round(), not int(): binary floating point makes e.g. 0.29 * 100 equal to
    # 28.999999999999996, which int() would truncate to 28, so two distinct
    # thresholds could end up with the same id and overwrite each other.
    exp_id = f'_{model_name}_{traces:04}_{events:03}_{noise:02}_{round(threshold * 100):02}_{randomise}'

    EXPERIMENTS['hamming' + exp_id] = {
        'class': PBLogGeneratorHamming,
        'parameters': params,
        'args': exp_args,
        'description': 'Hamming distance threshold implemented in ASP'
    }
    EXPERIMENTS['levenshtein' + exp_id] = {
        'class': PBLogGeneratorLevenshtein,
        'parameters': params,
        'args': exp_args,
        'description': 'Levenshtein distance threshold implemented in ASP'
    }

def custom_json(obj):
    """Fallback encoder for ``json.dump(..., default=custom_json)``.

    Called by the JSON encoder for every value it cannot serialise itself.
    Classes are written as their bare name; any other value is written as the
    placeholder ``obj(<type name>)``.

    The function never raises: every Python value is an instance of
    ``object``, so the former trailing ``raise TypeError`` could not be reached
    and has been removed.
    """
    if isinstance(obj, type):
        return obj.__name__
    return f'obj({type(obj).__name__})'

# Store the experiment definitions in the run directory; custom_json supplies
# a textual stand-in for the values the JSON encoder cannot handle.
with open(EXPORT_PATH / 'experiments.json', 'w') as fp:
    json.dump(EXPERIMENTS, fp, indent=2, default=custom_json)

EXPERIMENTS

In [None]:
from typing import Union
import warnings

def create_generator(exp: dict) -> PositionalBasedLogGenerator:
    """Build the generator described by one experiment entry.

    The class stored under ``exp['class']`` is instantiated with the keyword
    arguments stored under ``exp['args']['init']``.
    """
    generator_class = exp['class']
    init_kwargs = exp['args']['init']
    return generator_class(**init_kwargs)


def run_generator(gen: Union[PositionalBasedLogGenerator, dict], exp: Union[dict, None] = None) -> PositionalBasedLogGenerator:
    """Run a generator with the ``run`` arguments of its experiment.

    Args:
        gen: either an already-built generator or an experiment entry. Anything
            that is not a ``PositionalBasedLogGenerator`` instance is replaced
            by a generator freshly built with ``create_generator``.
        exp: the experiment entry providing ``exp['args']['run']``. May be
            omitted only when ``gen`` is itself the experiment entry.

    Returns:
        The generator that was run.

    Raises:
        TypeError: if ``gen`` is a built generator and ``exp`` is missing.
    """
    if not isinstance(gen, PositionalBasedLogGenerator):
        # NOTE(review): a generator object whose class does not derive from
        # PositionalBasedLogGenerator also takes this branch and is rebuilt
        # from `exp` -- confirm that all experiment classes are subclasses.
        if exp is None:
            exp = gen
        gen = create_generator(exp)
    elif exp is None:
        # Previously this surfaced later as "'NoneType' object is not subscriptable".
        raise TypeError('run_generator() requires `exp` when `gen` is an already-built generator')
    # Silence every warning raised while the generator runs.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gen.run(**exp['args']['run'])
    return gen

In [None]:
# One generator instance per registered experiment, keyed by experiment id.
GENERATORS = dict(zip(EXPERIMENTS, map(create_generator, EXPERIMENTS.values())))

In [None]:
import sys
from IPython import get_ipython
ipython = get_ipython()

for eid, gen, exp in ((k, GENERATORS[k], EXPERIMENTS[k]) for k in EXPERIMENTS.keys()):
    header = '-' * 5 + f' {eid} [{type(gen).__name__}] '
    print(header + '-' * (72 - len(header)))
    %time run_generator(gen, exp)


## Evaluate the variability

By default the average distances are based on the activity names only and payloads are ignored (see the `results_to_seq` function below); the `data_flow` figures additionally take the `resource` column into account.

In [None]:
from dataclasses import dataclass
import dataclasses
import itertools
import statistics
from typing import Callable, Hashable, Iterable, Optional, Sequence, Union

import Levenshtein
import pandas as pd

def results_to_seq(generator: PositionalBasedLogGenerator, columns: Sequence[str] = ("concept:name",)) -> Iterable[Sequence[Hashable]]:
    """Yield one sequence per generated case, with its events in order.

    The generator's result dataframe is grouped by ``case:concept:name`` and
    each group is sorted on ``concept:name:order``, whose ``event_<n>`` values
    are compared numerically rather than as text.

    Args:
        generator: object providing ``get_results_as_dataframe()``.
        columns: column name(s) used to describe an event. A single name may
            also be given as a plain string.

    Yields:
        For a single column, the list of that column's values; for several
        columns, a list with one tuple of values per event.
    """
    # pandas reads a tuple inside ``frame[...]`` as ONE (multi-level) label, and
    # a bare string would be iterated character by character: normalise both to
    # a list of column names.
    selected = [columns] if isinstance(columns, str) else list(columns)

    def case_key(key: pd.Series) -> pd.Series:
        # 'event_12' -> 12, so that event 10 sorts after event 9.
        return key.str.removeprefix('event_').astype('int64')

    for _, frame in generator.get_results_as_dataframe().groupby('case:concept:name'):
        case_df = frame.sort_values(by='concept:name:order', key=case_key)
        if len(selected) > 1:
            yield [tuple(r) for r in case_df[selected].to_numpy()]
        else:
            yield case_df[selected[0]].values.tolist()

@dataclass
class Distances:
    """Aggregated Levenshtein and Hamming distances between traces.

    The fields hold aggregated (by default averaged, optionally normalised)
    values, so they are generally fractional rather than whole numbers.
    """
    # aggregated Levenshtein distance
    levenshtein: float
    # aggregated Hamming distance
    hamming: float

    def asdict(self) -> dict:
        """Return the distances as a plain dictionary keyed by field name."""
        return dataclasses.asdict(self)

def average_distances_seq(traces: Iterable[Sequence[Hashable]], aggr: Callable=statistics.mean, normalise: Optional[int]=None) -> Distances:
    """Aggregate the pairwise Levenshtein and Hamming distances of ``traces``.

    Every unordered pair of traces is compared with ``Levenshtein.distance``
    and ``Levenshtein.hamming``. When ``normalise`` is given, each distance is
    divided by it before ``aggr`` (the mean by default) is applied.
    """
    # The pairs are needed twice, once per metric, so keep them in memory.
    pairs = list(itertools.combinations(traces, 2))

    def aggregate(metric: Callable) -> Union[int, float]:
        raw = (metric(left, right) for left, right in pairs)
        if normalise is None:
            return aggr(raw)
        return aggr(value / normalise for value in raw)

    lev = aggregate(Levenshtein.distance)
    ham = aggregate(Levenshtein.hamming)
    return Distances(levenshtein=lev, hamming=ham)

def average_distance(generator: PositionalBasedLogGenerator, columns: Sequence[str] = ["concept:name"], normalise: Optional[int]=None) -> Distances:
    """Compute the average pairwise distances of a generator's results.

    The results are turned into per-case sequences over ``columns`` with
    ``results_to_seq`` and then passed, together with ``normalise``, to
    ``average_distances_seq``.
    """
    sequences = results_to_seq(generator, columns=columns)
    return average_distances_seq(sequences, normalise=normalise)


In [None]:
def experiments_results(gens: dict[str, PositionalBasedLogGenerator], exps: dict[str, dict]) -> Iterable[dict]:
    """Collect one result row per generator that has been run.

    Args:
        gens: generators keyed by experiment id.
        exps: experiment entries keyed by the same ids; ``parameters`` is
            copied into the row and ``parameters['events']`` is used to
            normalise the distances.

    Returns:
        A list of dictionaries with the keys ``eid``, ``generator`` (class
        name), ``control_flow`` (distances over the activity names),
        ``data_flow`` (distances over activity name and resource), ``stats``
        and ``params``.
    """

    def ensure_stats(gen: PositionalBasedLogGenerator) -> dict:
        # Best effort: fall back to an empty dict when the statistics cannot be
        # obtained. Only ``Exception`` is caught so that Ctrl-C and interpreter
        # exit are not swallowed, as the former bare ``except`` did.
        try:
            return gen.get_running_stats()
        except Exception:
            return {}

    return [{
        'eid': eid,
        'generator': type(g).__name__,
        'control_flow': average_distance(g, normalise=exps[eid]['parameters']['events']).asdict(),
        'data_flow': average_distance(g, columns=['concept:name', 'resource'], normalise=exps[eid]['parameters']['events']).asdict(),
        'stats': ensure_stats(g),
        'params': exps[eid]['parameters']
    } for eid, g in gens.items()]

# Evaluate every experiment once and export the rows as JSON and as a flat CSV.
results = experiments_results(GENERATORS, EXPERIMENTS)

with EXPORT_PATH.joinpath('results.json').open('w') as fp:
    json.dump(results, fp)

# Reuse the rows computed above rather than recomputing all pairwise distances
# a second time just to build the table.
df = pd.json_normalize(results)

df.to_csv(EXPORT_PATH.joinpath('results.csv'))

df
