In [3]:
# Root directory of every event log dataset processed by the cells below.
base_dirs = [
    'logs/' + log_name
    for log_name in (
        'bpic2011',
        'bpic2013',
        'CoSeLoG',
        'Hospital_billings',
        'sepsis',
        'traffic_fines',
    )
]

#### Script to save event log characteristics for analysis

In [4]:
import pandas as pd
import numpy as np
import pm4py
import pickle
from collections import defaultdict

def analyze_event_log(ds):
    """Summarise variant and case statistics of the event log stored under ``ds``.

    Reads ``<ds>/log_duration.csv`` (';'-separated), passes it through
    ``pm4py.format_dataframe`` and derives a handful of descriptive numbers.

    Args:
        ds: Dataset directory containing ``log_duration.csv``.

    Returns:
        dict with the keys ``num_variants``, ``total_num_cases``,
        ``min_cases_variant``, ``max_cases_variant``, ``min_events_case``,
        ``max_events_case`` and ``avg_events_case``.
    """
    log_frame = pm4py.format_dataframe(
        pd.read_csv(f"{ds}/log_duration.csv", sep=';'),
        case_id='Case ID',
        activity_key='Activity',
        timestamp_key='Complete Timestamp',
    )

    # Mapping returned by pm4py; its values are used as per-variant sizes
    # (presumably the number of cases per variant — confirm for the pm4py version in use).
    variant_map = pm4py.get_variants(log_frame)
    variant_sizes = variant_map.values()

    # One entry per distinct case id; the value is that case's row (event) count.
    events_per_case = log_frame['Case ID'].value_counts()

    return {
        'num_variants': len(variant_map),
        'total_num_cases': len(events_per_case),
        'min_cases_variant': min(variant_sizes),
        'max_cases_variant': max(variant_sizes),
        'min_events_case': min(events_per_case),
        'max_events_case': max(events_per_case),
        'avg_events_case': np.mean(events_per_case),
    }


# Gather the statistics of every dataset, keyed by the last component of its
# directory path, then persist the whole mapping as one pickle.
all_insights = defaultdict(dict)
for ds in base_dirs:
    dataset_name = ds.rsplit('/', 1)[-1]
    all_insights[dataset_name] = analyze_event_log(ds)

with open('event_log_characteristics.pickle', 'wb') as insights_file:
    pickle.dump(all_insights, insights_file)


  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col], utc=True)
  df[col] = pd.to_datetime(df[col]

#### Convert the pickle files from algorithm runs to a single pickle file for analysis

In [6]:
import pickle
import os

def load_pickles(root_dir):
    """Index the PRETSA result pickles found anywhere below ``root_dir``.

    Every ``*.pickle`` file is classified by a substring of its file name:
    ``heuristic_pretsa`` -> ``'pretsa_bf'``, ``pretsa_star`` -> ``'pretsa_star'``,
    any other name containing ``pretsa`` -> ``'pretsa'``. Pickles matching none
    of these are ignored.

    Args:
        root_dir: Dataset directory to walk. Its last path component is used
            as the dataset name; a trailing path separator is tolerated.

    Returns:
        Nested defaultdict ``{dataset_name: {algorithm: [file paths]}}``.
        Paths are collected in sorted traversal order, so repeated runs over
        the same tree yield the same ordering.
    """
    # normpath drops a trailing separator; without it basename() returns ''
    # and every result would be filed under an empty dataset name.
    dataset_name = os.path.basename(os.path.normpath(root_dir))

    all_pickles = defaultdict(lambda: defaultdict(list))
    for root, dirs, files in os.walk(root_dir):
        # os.walk yields entries in filesystem order; sorting the directory
        # list in place makes the (top-down) traversal deterministic.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.pickle'):
                continue
            # The most specific names are tested first because all three
            # markers contain the substring 'pretsa'.
            if 'heuristic_pretsa' in file:
                algorithm = 'pretsa_bf'
            elif 'pretsa_star' in file:
                algorithm = 'pretsa_star'
            elif 'pretsa' in file:
                algorithm = 'pretsa'
            else:
                continue
            all_pickles[dataset_name][algorithm].append(os.path.join(root, file))
    return all_pickles

def create_unified_pickle_file(all_pickle_data, output_filename):
    """Flatten per-run result pickles into one list and pickle it.

    Args:
        all_pickle_data: Mapping ``{dataset_name: {algorithm: [pickle paths]}}``.
            Each file name is split on ``'_'``; field 2 is stored as
            ``t_value`` and field 3 as ``k_value`` (for a name such as
            ``log_duration_t1_k4_pretsa.pickle`` that is ``'t1'`` and ``'k4'``).
        output_filename: Path of the pickle file to write.

    Each loaded pickle must be a mapping with a ``'runtimes'`` mapping of
    numeric values; ``'cases'`` and ``'inflictedChanges'`` are optional and
    default to an empty list and ``0`` respectively.

    Raises:
        KeyError: if a loaded pickle has no ``'runtimes'`` entry.
        IndexError: if a file name has fewer than four ``'_'``-separated fields.
    """
    unified_data = []

    for dataset_name, algorithms in all_pickle_data.items():
        for algorithm, files in algorithms.items():
            for file in files:
                # SECURITY: pickle.load can execute arbitrary code; only feed
                # this function result files produced by trusted runs.
                with open(file, 'rb') as f:
                    content = pickle.load(f)

                # basename() handles whichever separator os.path.join used;
                # splitting the path on '/' misreads the fields on Windows.
                setting = os.path.basename(file).split('_')
                runtimes = content['runtimes']

                unified_data.append({
                    'eventlog': dataset_name,
                    'algorithm': algorithm,
                    'k_value': setting[3],
                    't_value': setting[2],
                    'runtimes': runtimes,
                    'number_of_cases': len(content.get('cases', [])),
                    'inflictedChanges': content.get('inflictedChanges', 0),
                    'total_time': sum(runtimes.values()),
                })

    with open(output_filename, 'wb') as f:
        pickle.dump(unified_data, f)


# Merge the per-dataset file indexes into a single mapping, then flatten every
# referenced result pickle into one unified file.
all_pickle_data = {}
for base_dir in base_dirs:
    per_dataset_index = load_pickles(base_dir)
    all_pickle_data.update(per_dataset_index)

create_unified_pickle_file(all_pickle_data, 'unified_insights.pickle')


#### Script to compare the original event log with the PRETSA event logs on characteristics such as Standard Edit Distance (sed), Mean Cycle Error, Inflicted Cases, and Cases_nr

In [None]:
import pandas as pd
import os
from calculateSEDBetweenEventLogs import get_sed_between_logs
import statistics

def get_mean_cycle_times(filePath):
    """Return the mean ``Duration`` per ``Activity`` of a ';'-separated event log CSV.

    Args:
        filePath: Path of the CSV file; it must contain the columns
            ``Activity`` and ``Duration``.

    Returns:
        pandas Series indexed by activity holding the mean duration.
    """
    log_frame = pd.read_csv(filePath, sep=";")
    durations_by_activity = log_frame.groupby('Activity')['Duration']
    return durations_by_activity.mean()




# Build one record per (dataset, k, t, algorithm) combination that compares the
# algorithm's output log with the dataset's original log. Inputs that are
# missing on disk are recorded with the sentinel -1 rather than skipped.
picks = []
for dir_path in base_dirs:

    filePathOriginalLog = dir_path + "/log_duration.csv"

    event_log_original = pd.read_csv(filePathOriginalLog, delimiter=";")
    # One dict per dataset, handed to every get_sed_between_logs call
    # (presumably a cache of already computed distances — confirm in
    # calculateSEDBetweenEventLogs).
    distanceMatrix = dict()
    original_cycle_time = get_mean_cycle_times(filePathOriginalLog)

    for k in (4, 8, 16, 32, 64):
        for t in (1, 2, 3, 4, 5):
            for algorithm in ("pretsa", "heuristic_pretsa", "pretsa_star"):
                # Results of "heuristic_pretsa" are looked up in a directory
                # named "pretsa_bf"; the file names keep the algorithm name.
                als = algorithm if algorithm != "heuristic_pretsa" else "pretsa_bf"
                run_name = "log_duration_t" + str(t) + "_k" + str(k) + "_" + algorithm
                filePathAlgoLog = dir_path + "/" + als + "/logs/" + run_name + ".csv"
                filePathAlgoPickel = dir_path + "/" + als + "/pickels/" + run_name + ".pickle"

                data = dict()

                if os.path.exists(filePathAlgoPickel):
                    # Context manager closes the handle even if unpickling fails.
                    # SECURITY: pickle.load can execute arbitrary code; only
                    # load result files produced by trusted runs.
                    with open(filePathAlgoPickel, 'rb') as file:
                        pickle_data = pickle.load(file)
                    data["cases"] = pickle_data["cases"]
                    data["cases_nr"] = len(pickle_data["cases"])
                    data["inflictedChanges"] = pickle_data["inflictedChanges"]
                else:
                    data["cases"] = -1
                    data["cases_nr"] = -1
                    data["inflictedChanges"] = -1

                if os.path.exists(filePathAlgoLog):
                    data["sed"] = get_sed_between_logs(
                        event_log_original, filePathAlgoLog, distanceMatrix)

                    log_cycle_times = get_mean_cycle_times(filePathAlgoLog)
                    # Relative error of the mean duration per activity, capped
                    # at 1. Activities whose original mean is 0 are left out
                    # (the ratio is undefined); activities absent from the
                    # algorithm's log count as mean 0.0, i.e. an error of 1.
                    errors = [
                        min(abs((log_cycle_times.get(activity, 0.0) / originalValue) - 1.0), 1)
                        for activity, originalValue in original_cycle_time.items()
                        if originalValue != 0.0
                    ]
                    # statistics.mean raises on an empty list, which would abort
                    # the whole sweep; record the "not available" sentinel instead.
                    data["error"] = statistics.mean(errors) if errors else -1
                else:
                    data["sed"] = -1
                    data["error"] = -1

                data["k"] = k
                data["t"] = t
                data["algorithm"] = algorithm
                data["dataset"] = dir_path.split("/")[-1]

                picks.append(data)

    # This dump sits inside the dataset loop: the file is rewritten with the
    # cumulative list after every dataset has been processed.
    with open("algo_output_comparision.pickle", 'wb') as f:
        pickle.dump(picks, f)
