# CAiSE 2025 experiments: original code

Uses the original code to generate positive logs and reports the average Levenshtein and Hamming distances between the generated traces.

In [None]:
from Declare4Py.ProcessMiningTasks.LogGenerator.PositionalBased.PositionalBasedLogGenerator import PositionalBasedLogGenerator
from Declare4Py.ProcessMiningTasks.LogGenerator.PositionalBased.PositionalBasedModel import PositionalBasedModel


In [None]:
# Tag embedded in the export file name so runs of different experiments
# do not overwrite each other's output.
experiment_id = 'caise2025-orig'

# Settings handed to PositionalBasedLogGenerator when it is constructed.
# The minimum and maximum event counts are deliberately set to the same value.
num_traces = 1000
min_ev, max_ev = 20, 20
# Used for both the positive and the negative noise percentage in generator.run.
noise = 10

# Base name of the model file (without extension); it is also part of the
# export file name.
model_name = "experimental_model_ivan 2"

# Input model file and output CSV file, both derived from the settings above.
model_path = f"{model_name}.decl"
export_path = f"output/{experiment_id}_{model_name}_noise_{noise}.csv"

In [None]:
# Parse the model stored at model_path, then build the log generator from the
# configured trace count and event bounds.
Model: PositionalBasedModel = PositionalBasedModel().parse_from_file(model_path)
generator: PositionalBasedLogGenerator = PositionalBasedLogGenerator(
    num_traces,
    min_ev,
    max_ev,
    Model,
    False,  # NOTE(review): meaning of this positional flag is not visible here; confirm against the constructor
)

Generate only the positive traces

In [None]:
%%time

# Negative traces are switched off; the same noise setting is passed for both
# the positive and the negative percentage.
generator.run(
    generate_negatives_traces=False,
    positive_noise_percentage=noise,
    negative_noise_percentage=noise,
)

In [None]:
# Export the generated log to export_path through the generator's to_csv method.
generator.to_csv(export_path)

In [None]:
# Quick visual check: display a sample of 10 rows from the results dataframe.
generator.get_results_as_dataframe().sample(10)

In [None]:
import itertools
import statistics
from typing import Hashable, Iterable, Sequence

import Levenshtein

def results_to_log(generator: PositionalBasedLogGenerator) -> Iterable[Sequence[Hashable]]:
    """Yield one trace per case as a list of activity names.

    The generator's results dataframe is grouped by ``case:concept:name``.
    Within each case the rows are ordered by the integer that remains after
    stripping the ``event_`` prefix from ``concept:name:order``, so that
    ``event_10`` comes after ``event_2``. Only the ``concept:name`` column is
    kept; every other column is dropped.

    Args:
        generator: Object whose ``get_results_as_dataframe()`` returns the
            generated log as a dataframe.

    Yields:
        The ``concept:name`` values of one case, in event order.
    """
    def _event_position(labels):
        # "event_<n>" -> n, so sorting is numeric rather than lexicographic.
        return labels.str.removeprefix('event_').astype('int64')

    events = generator.get_results_as_dataframe()
    for _case_id, case_events in events.groupby('case:concept:name'):
        ordered = case_events.sort_values(by='concept:name:order', key=_event_position)
        yield ordered["concept:name"].values.tolist()

def average_distances(traces: Iterable[Sequence[Hashable]], aggr=statistics.mean) -> tuple[float, float]:
    """Aggregate the pairwise Levenshtein and Hamming distances of traces.

    Every unordered pair of traces is compared exactly once.

    Args:
        traces: The traces to compare. The iterable is consumed once, so a
            one-shot iterator is acceptable.
        aggr: Callable that reduces an iterable of distances to a single
            value. Defaults to ``statistics.mean``.

    Returns:
        A ``(levenshtein, hamming)`` tuple holding the aggregated distances.

    Raises:
        Whatever ``aggr`` raises on empty input when fewer than two traces
        are supplied (``statistics.mean`` raises ``StatisticsError``).
    """
    # Materialise the traces once so the pairs can be enumerated twice.
    # Sharing one combinations() iterator through itertools.tee would buffer
    # every pair in memory while the first aggregation drains its branch.
    pool = tuple(traces)
    levenshtein = aggr(
        Levenshtein.distance(first, second)
        for first, second in itertools.combinations(pool, 2)
    )
    # NOTE(review): depending on the installed Levenshtein version, hamming()
    # may reject sequences of different lengths; confirm before using this
    # with min_ev != max_ev.
    hamming = aggr(
        Levenshtein.hamming(first, second)
        for first, second in itertools.combinations(pool, 2)
    )
    return levenshtein, hamming


In [None]:
# Materialise the traces so they can be measured here and reused afterwards.
results_lst = [*results_to_log(generator)]
print(f'Number of traces: {len(results_lst)}')
print(f'Average length of traces: {statistics.mean(map(len, results_lst))}')

The average distances are based on the activity names only; payloads are ignored (see the code of the `results_to_log` function above).

In [None]:
# Displays (average Levenshtein distance, average Hamming distance) computed
# over every unordered pair of traces in results_lst.
average_distances(results_lst)