In [1]:
import pandas as pd
import numpy as np
import pm4py
from pm4py.objects.log.util import sorting
from scipy.stats import wasserstein_distance
from sklearn import preprocessing

from log_distance_measures.config import EventLogIDs, AbsoluteTimestampType, discretize_to_hour
from log_distance_measures.control_flow_log_distance import control_flow_log_distance
from log_distance_measures.n_gram_distribution import n_gram_distribution_distance
from log_distance_measures.absolute_event_distribution import absolute_event_distribution_distance
from log_distance_measures.case_arrival_distribution import case_arrival_distribution_distance
from log_distance_measures.circadian_event_distribution import circadian_event_distribution_distance
from log_distance_measures.relative_event_distribution import relative_event_distribution_distance
from log_distance_measures.work_in_progress import work_in_progress_distance
from log_distance_measures.cycle_time_distribution import cycle_time_distribution_distance

import warnings
warnings.filterwarnings("ignore")

In [2]:
import re
def extract_first_float(cell):
    """
    Split a "mean (std)"-style cell into its leading number and bracketed part.

    Returns a 2-tuple:
      * non-string input        -> (cell, '') with the cell passed through untouched
      * string starting with a
        decimal number          -> (float(number), bracketed decimal as str, or '' if absent)
      * any other string        -> (inf, '')

    Only numbers written with a decimal point (digits, '.', digits) are recognised,
    and the bracketed value is returned as text, not converted to float.
    """
    if not isinstance(cell, str):
        return cell, ''

    # Anchored at the start of the string; the " (x.y)" suffix is optional.
    found = re.match(r'(\d+\.\d+)(?: \((\d+\.\d+)\))?', cell)
    if found is None:
        return float('inf'), ''

    leading, bracketed = found.groups()
    # The optional group is None when the suffix is missing.
    return float(leading), bracketed or ''

In [3]:
def highlight_min_max(s):
    """
    Build one CSS string per entry of ``s``: red background for entries equal to
    the maximum, green background for entries equal to the minimum, '' otherwise.

    An entry that is both minimum and maximum (e.g. all values equal) is red,
    because the maximum colour takes precedence.
    """
    hits_min = s == s.min()
    hits_max = s == s.max()
    return [
        'background-color: red' if at_max
        else ('background-color: green' if at_min else '')
        for at_min, at_max in zip(hits_min, hits_max)
    ]

In [4]:
def align_column_names(df):
    """
    Return a copy of ``df`` whose event-log columns use the notebook's canonical
    names: 'case_id', 'activity', 'resource', 'start_time' and 'end_time'.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log whose columns may use any of the aliases listed below.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is never returned or modified) with the matching
        columns renamed. Columns that match no alias are left as they are.

    Notes
    -----
    * For case, activity and resource only the FIRST alias found (in the listed
      priority order) is renamed; lower-priority aliases stay untouched.
    * When 'agent' is the selected resource alias, an already existing 'resource'
      column is dropped first so that the rename does not create a duplicate.
    * Every timestamp alias that is present is renamed, independently of the others.
    """
    # target column -> aliases in priority order (first one present wins)
    prioritised_aliases = {
        'case_id': ('case:concept:name', 'caseid'),
        'activity': ('Activity', 'activity_name', 'task', 'concept:name'),
        'resource': ('Resource', 'user', 'agent', 'org:resource'),
    }
    # alias -> target; each of these is applied whenever the alias is present
    timestamp_aliases = {
        'start_timestamp': 'start_time',
        'end_timestamp': 'end_time',
        'start:timestamp': 'start_time',
        'time:timestamp': 'end_time',
    }

    renames = {}
    for target, aliases in prioritised_aliases.items():
        selected = next((alias for alias in aliases if alias in df.columns), None)
        if selected is not None:
            renames[selected] = target
    for alias, target in timestamp_aliases.items():
        if alias in df.columns:
            renames[alias] = target

    # Only the 'agent' alias replaces a pre-existing 'resource' column.
    if renames.get('agent') == 'resource' and 'resource' in df.columns:
        df = df.drop(columns=['resource'])

    # rename() returns a new frame even when the mapping is empty.
    return df.rename(columns=renames)

In [10]:
def main_(log_paths, name_experiments, n_simulations=10):
    """
    Compare simulated event logs against their test log for several experiments.

    For every experiment the test log is read once, then the files
    ``<simulated_dir>/simulated_log_<i>.csv`` for ``i`` in ``range(n_simulations)``
    are read one by one and six log distance measures are computed between the
    test log and each simulated log.

    Parameters
    ----------
    log_paths : sequence
        One entry per experiment; ``entry[0]`` is the path of the test log CSV and
        ``entry[1]`` the directory that holds the simulated log CSVs.
    name_experiments : sequence of str
        Row labels of the result tables; paired with ``log_paths`` by position.
    n_simulations : int, default 10
        Number of ``simulated_log_<i>.csv`` files evaluated per experiment.

    Returns
    -------
    (mean_results, results_df) : tuple of pandas.DataFrame
        Both are indexed by ``name_experiments`` with one column per measure.
        ``mean_results`` holds the mean over the simulated logs rounded to three
        decimals; ``results_df`` holds the string ``"<mean> (<std>)"`` with both
        values rounded to three decimals.

    Raises
    ------
    IndexError
        If there are fewer experiment names than entries in ``log_paths``
        (raised before any log is read).
    """
    if len(name_experiments) < len(log_paths):
        raise IndexError(
            f"{len(log_paths)} experiments in log_paths but only "
            f"{len(name_experiments)} names in name_experiments"
        )

    # Set event log column ID mapping
    event_log_ids = EventLogIDs(  # These values are stored in DEFAULT_CSV_IDS
        case="case_id",
        activity="activity",
        start_time="start_time",
        end_time="end_time",
        resource='resource'
    )

    # (result column label, distance between a test log and a simulated log).
    # The order fixes both the evaluation order and the column order of the results.
    metrics = (
        ('N-Gram Distribution Distance',
         lambda test, simulated: n_gram_distribution_distance(
             test, event_log_ids, simulated, event_log_ids, n=3)),
        ('Absolute Event Distribution Distance',
         lambda test, simulated: absolute_event_distribution_distance(
             test, event_log_ids, simulated, event_log_ids,
             discretize_type=AbsoluteTimestampType.BOTH,  # consider start and end times
             discretize_event=discretize_to_hour)),
        ('Case Arrival Distribution Distance',
         lambda test, simulated: case_arrival_distribution_distance(
             test, event_log_ids, simulated, event_log_ids,
             discretize_event=discretize_to_hour)),
        ('Circadian Event Distribution Distance',
         lambda test, simulated: circadian_event_distribution_distance(
             test, event_log_ids, simulated, event_log_ids,
             discretize_type=AbsoluteTimestampType.BOTH)),
        ('Relative Event Distribution Distance',
         lambda test, simulated: relative_event_distribution_distance(
             test, event_log_ids, simulated, event_log_ids,
             discretize_type=AbsoluteTimestampType.BOTH,
             discretize_event=discretize_to_hour)),
        ('Cycle Time Distribution Distance',
         lambda test, simulated: cycle_time_distribution_distance(
             test, event_log_ids, simulated, event_log_ids,
             bin_size=pd.Timedelta(hours=1))),  # bins of 1 hour
    )

    def load_event_log(csv_path):
        """Read a CSV log, align its column names and parse both timestamp columns as UTC."""
        log = align_column_names(pd.read_csv(csv_path))
        for column in (event_log_ids.start_time, event_log_ids.end_time):
            log[column] = pd.to_datetime(log[column], utc=True, format='mixed')
        return log

    def evaluate_simulations(test_log, simulated_dir):
        """Return {column label: [distance per simulated log]} for one experiment."""
        distances = {label: [] for label, _ in metrics}
        for run in range(n_simulations):
            simulated_log = load_event_log(f"{simulated_dir}/simulated_log_{run}.csv")
            for label, distance in metrics:
                distances[label].append(distance(test_log, simulated_log))
        return distances

    results_df = pd.DataFrame(index=name_experiments)
    mean_results = pd.DataFrame(index=name_experiments)

    for position, paths in enumerate(log_paths):
        experiment_name = name_experiments[position]
        test_log = load_event_log(paths[0])
        distances = evaluate_simulations(test_log, paths[1])

        for label, values in distances.items():
            mean = round(np.mean(values), 3)
            std = round(np.std(values), 3)
            mean_results.loc[experiment_name, label] = mean
            results_df.loc[experiment_name, label] = f"{mean} ({std})"

    return mean_results, results_df


## Loan Application

In [23]:
# Loan Application: for each experiment, the test log CSV and the directory in
# which main_ looks for simulated_log_<i>.csv.
PATH_TEST_LOG_SIMOD = '../simulated_data/simod/Loan_Application_simod/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD = '../simulated_data/simod/Loan_Application_simod/best_result/evaluation'

PATH_TEST_LOG_SIMOD_extr = '../simulated_data/simod/Loan_Application_extraneous/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD_extr = '../simulated_data/simod/Loan_Application_extraneous/best_result/evaluation'

PATH_TEST_LOG_DGEN = '../simulated_data/deep_generator/Loan_Application/tst_LoanApp.csv'
PATH_SIMULATED_LOG_DGEN = '../simulated_data/deep_generator/Loan_Application'

# NOTE(review): the test log under deep_simulator/Loan_Application is named
# tst_SynLoan.csv, and the DSIM row this notebook reports for Loan Application is
# identical to the DSIM row it reports for SynLoan -- confirm this directory
# really holds Loan Application data.
PATH_TEST_LOG_DSIM = '../simulated_data/deep_simulator/Loan_Application/tst_SynLoan.csv'
PATH_SIMULATED_LOG_DSIM = '../simulated_data/deep_simulator/Loan_Application'

PATH_TEST_LOG_MAS = '../simulated_data/LoanApp.csv/no_delays/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS = '../simulated_data/LoanApp.csv/no_delays'

# The two directories below end with '/', so main_ builds paths containing '//'.
PATH_TEST_LOG_MAS_extr = '../simulated_data/LoanApp.csv/delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr = '../simulated_data/LoanApp.csv/delays_decentral/'

PATH_TEST_LOG_MAS_central = '../simulated_data/LoanApp.csv/no_delays_central/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_central = '../simulated_data/LoanApp.csv/no_delays_central/'


# [test log, simulated-log directory] pairs; main_ matches them to
# name_experiments by position, so both lists must keep the same order.
log_paths = [[PATH_TEST_LOG_SIMOD, PATH_SIMULATED_LOG_SIMOD], [PATH_TEST_LOG_SIMOD_extr, PATH_SIMULATED_LOG_SIMOD_extr], 
             [PATH_TEST_LOG_DGEN, PATH_SIMULATED_LOG_DGEN], [PATH_TEST_LOG_DSIM, PATH_SIMULATED_LOG_DSIM],
                [PATH_TEST_LOG_MAS, PATH_SIMULATED_LOG_MAS], [PATH_TEST_LOG_MAS_extr, PATH_SIMULATED_LOG_MAS_extr],
                [PATH_TEST_LOG_MAS_central, PATH_SIMULATED_LOG_MAS_central]]

name_experiments = ['SIMOD', 'SIMOD_extr', 'DGEN', 'DSIM', 'MAS', 'MAS_extr', 'MAS_central']

In [24]:
# Uses log_paths / name_experiments from the Loan Application cell directly above;
# every dataset section rebinds these names, so run that cell first.
mean_results, results_df = main_(log_paths, name_experiments)
# Per metric column: minimum highlighted green, maximum red (highlight_min_max).
styled_df = mean_results.style.apply(highlight_min_max)
styled_df

Unnamed: 0,N-Gram Distribution Distance,Absolute Event Distribution Distance,Case Arrival Distribution Distance,Circadian Event Distribution Distance,Relative Event Distribution Distance,Cycle Time Distribution Distance
SIMOD,0.21,12.344,15.797,0.418,12.865,33.997
SIMOD_extr,0.15,13.551,17.393,0.402,9.228,20.425
DGEN,0.219,212.278,206.7,13.403,5.262,9.385
DSIM,0.557,428.562,326.035,0.376,109.816,467.776
MAS,0.077,5.231,6.7,0.206,1.314,1.584
MAS_extr,0.074,2.936,0.004,0.199,1.662,1.749
MAS_central,0.075,2.761,0.0,0.219,1.358,1.503


## P2P

In [35]:
# P2P: for each experiment, the test log CSV and the directory in which main_
# looks for simulated_log_<i>.csv.
PATH_TEST_LOG_SIMOD = '../simulated_data/simod/P2P_simod/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD = '../simulated_data/simod/P2P_simod/best_result/evaluation'

PATH_TEST_LOG_SIMOD_extr = '../simulated_data/simod/P2P_extraneous/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD_extr = '../simulated_data/simod/P2P_extraneous/best_result/evaluation'

PATH_TEST_LOG_DGEN = '../simulated_data/deep_generator/P2P/tst_PurchasingExample.csv'
PATH_SIMULATED_LOG_DGEN = '../simulated_data/deep_generator/P2P'

PATH_TEST_LOG_DSIM = '../simulated_data/deep_simulator/PurchasingExample/tst_PurchasingExample.csv'
PATH_SIMULATED_LOG_DSIM = '../simulated_data/deep_simulator/PurchasingExample'

PATH_TEST_LOG_MAS = '../simulated_data/P2P/decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS = '../simulated_data/P2P/decentral'

# The two directories below end with '/', so main_ builds paths containing '//'.
PATH_TEST_LOG_MAS_extr = '../simulated_data/P2P/delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr = '../simulated_data/P2P/delays_decentral/'

PATH_TEST_LOG_MAS_extr_central = '../simulated_data/P2P/delays_central/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr_central = '../simulated_data/P2P/delays_central/'


# [test log, simulated-log directory] pairs; main_ matches them to
# name_experiments by position, so both lists must keep the same order.
log_paths = [[PATH_TEST_LOG_SIMOD, PATH_SIMULATED_LOG_SIMOD], [PATH_TEST_LOG_SIMOD_extr, PATH_SIMULATED_LOG_SIMOD_extr], 
                [PATH_TEST_LOG_DGEN, PATH_SIMULATED_LOG_DGEN], [PATH_TEST_LOG_DSIM, PATH_SIMULATED_LOG_DSIM], 
             [PATH_TEST_LOG_MAS, PATH_SIMULATED_LOG_MAS], [PATH_TEST_LOG_MAS_extr, PATH_SIMULATED_LOG_MAS_extr], 
             [PATH_TEST_LOG_MAS_extr_central, PATH_SIMULATED_LOG_MAS_extr_central], ]

name_experiments = ['SIMOD', 'SIMOD_extr', 'DGEN', 'DSIM', 'MAS', 'MAS_extr', 'MAS_extr_central']

In [36]:
# Uses log_paths / name_experiments from the P2P cell directly above;
# every dataset section rebinds these names, so run that cell first.
mean_results, results_df = main_(log_paths, name_experiments)
# Per metric column: minimum highlighted green, maximum red (highlight_min_max).
styled_df = mean_results.style.apply(highlight_min_max)
styled_df

Unnamed: 0,N-Gram Distribution Distance,Absolute Event Distribution Distance,Case Arrival Distribution Distance,Circadian Event Distribution Distance,Relative Event Distribution Distance,Cycle Time Distribution Distance
SIMOD,0.415,1044.251,563.892,2.213,840.194,677.053
SIMOD_extr,0.415,867.561,556.312,1.674,682.133,571.583
DGEN,0.2,1481.46,1007.446,2.558,828.093,670.053
DSIM,0.222,1310.037,927.584,1.157,722.334,566.637
MAS,0.237,1272.434,857.309,4.799,766.139,546.851
MAS_extr,0.252,1122.419,811.832,0.976,668.205,529.084
MAS_extr_central,0.249,1098.16,779.446,1.065,674.828,528.991


## ACR = ConsultaDataMining

In [33]:
# ACR (ConsultaDataMining): for each experiment, the test log CSV and the
# directory in which main_ looks for simulated_log_<i>.csv.
PATH_TEST_LOG_SIMOD = '../simulated_data/simod/ConsultaDataMining_simod/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD = '../simulated_data/simod/ConsultaDataMining_simod/best_result/evaluation'

PATH_TEST_LOG_SIMOD_extr = '../simulated_data/simod/ConsultaDataMining_extraneous/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD_extr = '../simulated_data/simod/ConsultaDataMining_extraneous/best_result/evaluation'

PATH_TEST_LOG_DGEN = '../simulated_data/deep_generator/ConsultaDataMining/tst_ConsultaDataMining201618.csv'
PATH_SIMULATED_LOG_DGEN = '../simulated_data/deep_generator/ConsultaDataMining'

PATH_TEST_LOG_DSIM = '../simulated_data/deep_simulator/ConsultaDataMining201618/tst_ConsultaDataMining201618.csv'
PATH_SIMULATED_LOG_DSIM = '../simulated_data/deep_simulator/ConsultaDataMining201618'

PATH_TEST_LOG_MAS = '../simulated_data/ConsultaDataMining/decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS = '../simulated_data/ConsultaDataMining/decentral'

PATH_TEST_LOG_MAS_extr = '../simulated_data/ConsultaDataMining/delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr = '../simulated_data/ConsultaDataMining/delays_decentral'

PATH_TEST_LOG_MAS_central = '../simulated_data/ConsultaDataMining/central/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_central = '../simulated_data/ConsultaDataMining/central'


# [test log, simulated-log directory] pairs; main_ matches them to
# name_experiments by position, so both lists must keep the same order.
log_paths = [[PATH_TEST_LOG_SIMOD, PATH_SIMULATED_LOG_SIMOD], [PATH_TEST_LOG_SIMOD_extr, PATH_SIMULATED_LOG_SIMOD_extr], 
    [PATH_TEST_LOG_DGEN, PATH_SIMULATED_LOG_DGEN], [PATH_TEST_LOG_DSIM, PATH_SIMULATED_LOG_DSIM], 
    [PATH_TEST_LOG_MAS, PATH_SIMULATED_LOG_MAS], [PATH_TEST_LOG_MAS_extr, PATH_SIMULATED_LOG_MAS_extr], [PATH_TEST_LOG_MAS_central, PATH_SIMULATED_LOG_MAS_central]]

name_experiments = ['SIMOD', 'SIMOD_extr', 'DGEN', 'DSIM', 'MAS', 'MAS_extr', 'MAS_central']

In [34]:
# Uses log_paths / name_experiments from the ConsultaDataMining cell directly above;
# every dataset section rebinds these names, so run that cell first.
mean_results, results_df = main_(log_paths, name_experiments)
# Per metric column: minimum highlighted green, maximum red (highlight_min_max).
styled_df = mean_results.style.apply(highlight_min_max)
styled_df

Unnamed: 0,N-Gram Distribution Distance,Absolute Event Distribution Distance,Case Arrival Distribution Distance,Circadian Event Distribution Distance,Relative Event Distribution Distance,Cycle Time Distribution Distance
SIMOD,0.239,287.279,252.235,2.602,32.461,93.512
SIMOD_extr,0.501,107.697,164.108,3.156,94.391,277.454
DGEN,0.314,559.675,527.891,17.84,30.875,95.113
DSIM,0.26,273.468,262.327,4.644,15.621,48.246
MAS,0.524,305.226,282.523,6.953,26.046,75.649
MAS_extr,0.394,237.275,361.273,5.702,168.656,407.11
MAS_central,0.367,281.576,254.927,7.212,27.328,77.26


## Production

In [39]:
# Production: for each experiment, the test log CSV and the directory in which
# main_ looks for simulated_log_<i>.csv.
PATH_TEST_LOG_SIMOD = '../simulated_data/simod/Production/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD = '../simulated_data/simod/Production/best_result/evaluation'

PATH_TEST_LOG_SIMOD_extr = '../simulated_data/simod/Production_extraneous/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD_extr = '../simulated_data/simod/Production_extraneous/best_result/evaluation'

PATH_TEST_LOG_DGEN = '../simulated_data/deep_generator/Productions/tst_Productions.csv'
PATH_SIMULATED_LOG_DGEN = '../simulated_data/deep_generator/Productions'

PATH_TEST_LOG_DSIM = '../simulated_data/deep_simulator/Production/tst_Production.csv'
PATH_SIMULATED_LOG_DSIM = '../simulated_data/deep_simulator/Production'

# In this dataset the plain MAS variants read from 'no_delays_*' directories and
# the '_extr' variants from 'decentral' / 'central'.
PATH_TEST_LOG_MAS = '../simulated_data/Production/no_delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS = '../simulated_data/Production/no_delays_decentral'

PATH_TEST_LOG_MAS_extr = '../simulated_data/Production/decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr = '../simulated_data/Production/decentral'

PATH_TEST_LOG_MAS_extr_central = '../simulated_data/Production/central/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr_central = '../simulated_data/Production/central'

PATH_TEST_LOG_MAS_central = '../simulated_data/Production/no_delays_central/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_central = '../simulated_data/Production/no_delays_central'

# [test log, simulated-log directory] pairs; main_ matches them to
# name_experiments by position, so both lists must keep the same order.
log_paths = [[PATH_TEST_LOG_SIMOD, PATH_SIMULATED_LOG_SIMOD], [PATH_TEST_LOG_SIMOD_extr, PATH_SIMULATED_LOG_SIMOD_extr], [PATH_TEST_LOG_DGEN, PATH_SIMULATED_LOG_DGEN], [PATH_TEST_LOG_DSIM, PATH_SIMULATED_LOG_DSIM], 
    [PATH_TEST_LOG_MAS, PATH_SIMULATED_LOG_MAS], [PATH_TEST_LOG_MAS_extr, PATH_SIMULATED_LOG_MAS_extr], [PATH_TEST_LOG_MAS_central, PATH_SIMULATED_LOG_MAS_central], 
    [PATH_TEST_LOG_MAS_extr_central,PATH_SIMULATED_LOG_MAS_extr_central] ]
name_experiments = ['SIMOD', 'SIMOD_extr', 'DGEN', 'DSIM', 'MAS', 'MAS_extr', 'MAS_central', 'MAS_extr_central']

In [40]:
# Uses log_paths / name_experiments from the Production cell directly above;
# every dataset section rebinds these names, so run that cell first.
mean_results, results_df = main_(log_paths, name_experiments)
# Per metric column: minimum highlighted green, maximum red (highlight_min_max).
styled_df = mean_results.style.apply(highlight_min_max)
styled_df

Unnamed: 0,N-Gram Distribution Distance,Absolute Event Distribution Distance,Case Arrival Distribution Distance,Circadian Event Distribution Distance,Relative Event Distribution Distance,Cycle Time Distribution Distance
SIMOD,0.928,146.381,166.895,2.82,83.88,89.15
SIMOD_extr,0.926,309.04,177.2,3.236,78.786,212.898
DGEN,0.528,224.455,198.333,9.3,70.113,90.82
DSIM,0.868,154.319,165.609,2.669,33.308,43.267
MAS,0.774,70.358,79.034,5.97,22.491,40.671
MAS_extr,0.745,148.332,78.292,3.131,193.25,237.108
MAS_central,0.615,61.803,80.616,5.688,17.773,23.45
MAS_extr_central,0.628,54.918,92.203,5.778,13.309,33.795


## BPIC 2012 W

In [110]:
# BPIC 2012 W: for each experiment, the test log CSV and the directory in which
# main_ looks for simulated_log_<i>.csv.
PATH_TEST_LOG_SIMOD = '../simulated_data/simod/BPIC_2012W_simod/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD = '../simulated_data/simod/BPIC_2012W_simod/best_result/evaluation'

PATH_TEST_LOG_SIMOD_extr = '../simulated_data/simod/BPIC_2012W_extraneous/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD_extr = '../simulated_data/simod/BPIC_2012W_extraneous/best_result/evaluation'

PATH_TEST_LOG_DGEN = '../simulated_data/deep_generator/BPIC_2012W/tst_BPI_Challenge_2012_W_Two_TS.csv'
PATH_SIMULATED_LOG_DGEN = '../simulated_data/deep_generator/BPIC_2012W'

PATH_TEST_LOG_DSIM = '../simulated_data/deep_simulator/BPI_Challenge_2012_W_Two_TS/tst_BPI_Challenge_2012_W_Two_TS.csv'
PATH_SIMULATED_LOG_DSIM = '../simulated_data/deep_simulator/BPI_Challenge_2012_W_Two_TS'

# In this dataset plain MAS reads from 'no_delays_decentral' and the '_extr'
# variants from 'decentral' / 'central'.
PATH_TEST_LOG_MAS = '../simulated_data/BPIC_2012_W/no_delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS = '../simulated_data/BPIC_2012_W/no_delays_decentral'

PATH_TEST_LOG_MAS_extr = '../simulated_data/BPIC_2012_W/decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr = '../simulated_data/BPIC_2012_W/decentral'

PATH_TEST_LOG_MAS_extr_central = '../simulated_data/BPIC_2012_W/central/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr_central = '../simulated_data/BPIC_2012_W/central'

# [test log, simulated-log directory] pairs; main_ matches them to
# name_experiments by position, so both lists must keep the same order.
log_paths = [[PATH_TEST_LOG_SIMOD, PATH_SIMULATED_LOG_SIMOD], [PATH_TEST_LOG_SIMOD_extr, PATH_SIMULATED_LOG_SIMOD_extr], 
    [PATH_TEST_LOG_DGEN, PATH_SIMULATED_LOG_DGEN], [PATH_TEST_LOG_DSIM, PATH_SIMULATED_LOG_DSIM], [PATH_TEST_LOG_MAS, PATH_SIMULATED_LOG_MAS],
    [PATH_TEST_LOG_MAS_extr, PATH_SIMULATED_LOG_MAS_extr], 
    [PATH_TEST_LOG_MAS_extr_central, PATH_SIMULATED_LOG_MAS_extr_central]]

name_experiments = ['SIMOD', 'SIMOD_extr', 'DGEN', 'DSIM', 'MAS', 'MAS_extr', 'MAS_extr_central']

In [111]:
# Uses log_paths / name_experiments from the BPIC 2012 W cell directly above;
# every dataset section rebinds these names, so run that cell first.
mean_results, results_df = main_(log_paths, name_experiments)
# Per metric column: minimum highlighted green, maximum red (highlight_min_max).
styled_df = mean_results.style.apply(highlight_min_max)
styled_df

Unnamed: 0,N-Gram Distribution Distance,Absolute Event Distribution Distance,Case Arrival Distribution Distance,Circadian Event Distribution Distance,Relative Event Distribution Distance,Cycle Time Distribution Distance
SIMOD,0.633,120.262,17.013,1.876,140.684,194.671
SIMOD_extr,0.72,71.977,18.128,1.716,95.72,155.467
DGEN,0.435,306.28,199.315,4.534,116.188,176.794
DSIM,0.655,78.625,32.128,2.887,119.126,173.491
MAS,0.322,82.375,50.508,2.163,139.595,194.403
MAS_extr,0.213,92.01,44.503,1.915,52.054,96.909
MAS_extr_central,0.151,81.523,35.903,1.893,49.609,92.993


## CVS Pharmacy

In [23]:
# CVS Pharmacy: for each experiment, the test log CSV and the directory in which
# main_ looks for simulated_log_<i>.csv.
PATH_TEST_LOG_SIMOD = '../simulated_data/simod/cvs_pharmacy_simod/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD = '../simulated_data/simod/cvs_pharmacy_simod/best_result/evaluation'

PATH_TEST_LOG_SIMOD_extr = '../simulated_data/simod/cvs_pharmacy_extraneous/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD_extr = '../simulated_data/simod/cvs_pharmacy_extraneous/best_result/evaluation'

PATH_TEST_LOG_DGEN = '../simulated_data/deep_generator/CVS/tst_cvs_pharmacy.csv'
PATH_SIMULATED_LOG_DGEN = '../simulated_data/deep_generator/CVS'

PATH_TEST_LOG_DSIM = '../simulated_data/deep_simulator/CVS/tst_cvs_pharmacy.csv'
PATH_SIMULATED_LOG_DSIM = '../simulated_data/deep_simulator/CVS'

# NOTE(review): here MAS reads 'delays_decentral' and MAS_extr reads 'decentral',
# the reverse of the P2P, ConsultaDataMining and Confidential 1000 cells
# (MAS -> 'decentral', MAS_extr -> 'delays_decentral') -- confirm the two are
# not swapped.
PATH_TEST_LOG_MAS = '../simulated_data/cvs_pharmacy/delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS = '../simulated_data/cvs_pharmacy/delays_decentral'

PATH_TEST_LOG_MAS_extr = '../simulated_data/cvs_pharmacy/decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr = '../simulated_data/cvs_pharmacy/decentral'

PATH_TEST_LOG_MAS_extr_central = '../simulated_data/cvs_pharmacy/central/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr_central = '../simulated_data/cvs_pharmacy/central'


# [test log, simulated-log directory] pairs; main_ matches them to
# name_experiments by position, so both lists must keep the same order.
log_paths = [[PATH_TEST_LOG_SIMOD, PATH_SIMULATED_LOG_SIMOD], [PATH_TEST_LOG_SIMOD_extr, PATH_SIMULATED_LOG_SIMOD_extr], 
    [PATH_TEST_LOG_DGEN, PATH_SIMULATED_LOG_DGEN], [PATH_TEST_LOG_DSIM, PATH_SIMULATED_LOG_DSIM], [PATH_TEST_LOG_MAS, PATH_SIMULATED_LOG_MAS],
    [PATH_TEST_LOG_MAS_extr, PATH_SIMULATED_LOG_MAS_extr], [PATH_TEST_LOG_MAS_extr_central, PATH_SIMULATED_LOG_MAS_extr_central] ]
name_experiments = ['SIMOD', 'SIMOD_extr', 'DGEN', 'DSIM', 'MAS', 'MAS_extr', 'MAS_extr_central']

In [24]:
# Uses log_paths / name_experiments from the CVS Pharmacy cell directly above;
# every dataset section rebinds these names, so run that cell first.
mean_results, results_df = main_(log_paths, name_experiments)
# Per metric column: minimum highlighted green, maximum red (highlight_min_max).
styled_df = mean_results.style.apply(highlight_min_max)
styled_df

Unnamed: 0,N-Gram Distribution Distance,Absolute Event Distribution Distance,Case Arrival Distribution Distance,Circadian Event Distribution Distance,Relative Event Distribution Distance,Cycle Time Distribution Distance
SIMOD,0.44,52.947,7.709,0.444,39.432,54.594
SIMOD_extr,0.465,45.774,6.432,1.057,37.557,90.999
DGEN,0.219,310.394,133.461,11.699,176.652,294.214
DSIM,0.201,36.237,20.367,8.982,19.743,52.43
MAS,0.086,373.835,205.725,0.395,168.234,273.227
MAS_extr,0.119,91.816,2.635,7.475,87.738,109.702
MAS_extr_central,0.118,94.986,5.122,7.46,88.657,111.07


## BPIC 2017 W

In [45]:
# BPIC 2017 W: for each experiment, the test log CSV and the directory in which
# main_ looks for simulated_log_<i>.csv.
PATH_TEST_LOG_SIMOD = '../simulated_data/simod/BPIC_2017_W_simod/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD = '../simulated_data/simod/BPIC_2017_W_simod/best_result/evaluation'

PATH_TEST_LOG_SIMOD_extr = '../simulated_data/simod/BPIC_2017_W_extraneous/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD_extr = '../simulated_data/simod/BPIC_2017_W_extraneous/best_result/evaluation'

PATH_TEST_LOG_DGEN = '../simulated_data/deep_generator/BPIC_2017W/test_log.csv' 
PATH_SIMULATED_LOG_DGEN = '../simulated_data/deep_generator/BPIC_2017W'

PATH_TEST_LOG_DSIM = '../simulated_data/deep_simulator/BPI_Challenge_2017_W_Two_TS/tst_BPI_Challenge_2017_W_Two_TS.csv'
PATH_SIMULATED_LOG_DSIM = '../simulated_data/deep_simulator/BPI_Challenge_2017_W_Two_TS'

# In this dataset plain MAS reads from 'no_delays_decentral' and the '_extr'
# variants from 'decentral' / 'central'.
PATH_TEST_LOG_MAS = '../simulated_data/BPIC_2017_W/no_delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS = '../simulated_data/BPIC_2017_W/no_delays_decentral'

PATH_TEST_LOG_MAS_extr = '../simulated_data/BPIC_2017_W/decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr = '../simulated_data/BPIC_2017_W/decentral'

PATH_TEST_LOG_MAS_extr_central = '../simulated_data/BPIC_2017_W/central/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr_central = '../simulated_data/BPIC_2017_W/central'



# [test log, simulated-log directory] pairs; main_ matches them to
# name_experiments by position, so both lists must keep the same order.
log_paths = [[PATH_TEST_LOG_SIMOD, PATH_SIMULATED_LOG_SIMOD], [PATH_TEST_LOG_SIMOD_extr, PATH_SIMULATED_LOG_SIMOD_extr], [PATH_TEST_LOG_DGEN, PATH_SIMULATED_LOG_DGEN], 
    [PATH_TEST_LOG_DSIM, PATH_SIMULATED_LOG_DSIM], [PATH_TEST_LOG_MAS, PATH_SIMULATED_LOG_MAS],
    [PATH_TEST_LOG_MAS_extr, PATH_SIMULATED_LOG_MAS_extr], [PATH_TEST_LOG_MAS_extr_central,PATH_SIMULATED_LOG_MAS_extr_central],
    ]
name_experiments = ['SIMOD', 'SIMOD_extr', 'DGEN', 'DSIM', 'MAS', 'MAS_extr', 'MAS_extr_central']

In [46]:
# Uses log_paths / name_experiments from the BPIC 2017 cell directly above;
# every dataset section rebinds these names, so run that cell first.
mean_results, results_df = main_(log_paths, name_experiments)
# Per metric column: minimum highlighted green, maximum red (highlight_min_max).
styled_df = mean_results.style.apply(highlight_min_max)
styled_df

Unnamed: 0,N-Gram Distribution Distance,Absolute Event Distribution Distance,Case Arrival Distribution Distance,Circadian Event Distribution Distance,Relative Event Distribution Distance,Cycle Time Distribution Distance
SIMOD,0.591,385.129,194.537,2.258,208.852,274.698
SIMOD_extr,0.591,300.28,182.133,3.342,136.63,148.403
DGEN,0.671,4557.193,4594.98,3.396,118.848,172.94
DSIM,0.536,54.613,53.989,3.346,33.106,30.266
MAS,0.194,430.907,240.228,1.525,209.105,275.302
MAS_extr,0.352,380.512,203.818,1.677,400.113,34.997
MAS_extr_central,0.193,221.493,193.537,1.787,50.01,54.819


## Confidential 1000

In [41]:
# Confidential 1000: for each experiment, the test log CSV and the directory in
# which main_ looks for simulated_log_<i>.csv.
PATH_TEST_LOG_SIMOD = '../simulated_data/simod/Confidential_1000_simod/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD = '../simulated_data/simod/Confidential_1000_simod/best_result/evaluation'

PATH_TEST_LOG_SIMOD_extr = '../simulated_data/simod/Confidential_1000_extraneous/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD_extr = '../simulated_data/simod/Confidential_1000_extraneous/best_result/evaluation'

PATH_TEST_LOG_DGEN = '../simulated_data/deep_generator/Confidential_1000/tst_confidential_1000.csv'
PATH_SIMULATED_LOG_DGEN = '../simulated_data/deep_generator/Confidential_1000'

PATH_TEST_LOG_DSIM = '../simulated_data/deep_simulator/confidential_1000/tst_confidential_1000.csv'
PATH_SIMULATED_LOG_DSIM = '../simulated_data/deep_simulator/confidential_1000'

PATH_TEST_LOG_MAS = '../simulated_data/Confidential_1000/decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS = '../simulated_data/Confidential_1000/decentral'

PATH_TEST_LOG_MAS_extr = '../simulated_data/Confidential_1000/delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr = '../simulated_data/Confidential_1000/delays_decentral'

PATH_TEST_LOG_MAS_central = '../simulated_data/Confidential_1000/central/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_central = '../simulated_data/Confidential_1000/central'



# [test log, simulated-log directory] pairs; main_ matches them to
# name_experiments by position, so both lists must keep the same order.
log_paths = [[PATH_TEST_LOG_SIMOD, PATH_SIMULATED_LOG_SIMOD], [PATH_TEST_LOG_SIMOD_extr, PATH_SIMULATED_LOG_SIMOD_extr], 
    [PATH_TEST_LOG_DGEN, PATH_SIMULATED_LOG_DGEN], [PATH_TEST_LOG_DSIM, PATH_SIMULATED_LOG_DSIM], [PATH_TEST_LOG_MAS, PATH_SIMULATED_LOG_MAS],
    [PATH_TEST_LOG_MAS_extr, PATH_SIMULATED_LOG_MAS_extr], [PATH_TEST_LOG_MAS_central,PATH_SIMULATED_LOG_MAS_central],
    ]
name_experiments = ['SIMOD', 'SIMOD_extr', 'DGEN', 'DSIM', 'MAS', 'MAS_extr', 'MAS_central']

In [42]:
# Uses log_paths / name_experiments from the Confidential 1000 cell directly above;
# every dataset section rebinds these names, so run that cell first.
mean_results, results_df = main_(log_paths, name_experiments)
# Per metric column: minimum highlighted green, maximum red (highlight_min_max).
styled_df = mean_results.style.apply(highlight_min_max)
styled_df

Unnamed: 0,N-Gram Distribution Distance,Absolute Event Distribution Distance,Case Arrival Distribution Distance,Circadian Event Distribution Distance,Relative Event Distribution Distance,Cycle Time Distribution Distance
SIMOD,0.247,344.486,119.326,3.01,468.818,804.074
SIMOD_extr,0.243,351.939,119.448,2.725,476.036,813.441
DGEN,0.581,462.847,452.122,18.934,8.114,13.929
DSIM,0.203,246.41,239.415,2.28,5.342,7.297
MAS,0.266,166.738,171.04,1.702,6.939,9.494
MAS_extr,0.188,123.083,207.147,1.937,85.425,139.547
MAS_central,0.255,135.703,145.928,1.66,13.221,23.155


## Confidential 2000

In [57]:
# Confidential 2000: for each experiment, the test log CSV and the directory in
# which main_ looks for simulated_log_<i>.csv.
PATH_TEST_LOG_SIMOD = '../simulated_data/simod/Confidential_2000_simod/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD = '../simulated_data/simod/Confidential_2000_simod/best_result/evaluation'

PATH_TEST_LOG_SIMOD_extr = '../simulated_data/simod/Confidential_2000_extraneous/best_result/evaluation/test_log.csv'
PATH_SIMULATED_LOG_SIMOD_extr = '../simulated_data/simod/Confidential_2000_extraneous/best_result/evaluation'

PATH_TEST_LOG_DGEN = '../simulated_data/deep_generator/Confidential_2000/tst_confidential_2000.csv'
PATH_SIMULATED_LOG_DGEN = '../simulated_data/deep_generator/Confidential_2000'

PATH_TEST_LOG_DSIM = '../simulated_data/deep_simulator/confidential_2000/tst_confidential_2000.csv'
PATH_SIMULATED_LOG_DSIM = '../simulated_data/deep_simulator/confidential_2000'

# In this dataset the plain MAS variants read from 'no_delays_*' directories and
# MAS_extr from 'delays'.
PATH_TEST_LOG_MAS = '../simulated_data/Confidential_2000/no_delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS = '../simulated_data/Confidential_2000/no_delays_decentral'

PATH_TEST_LOG_MAS_extr = '../simulated_data/Confidential_2000/delays/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr = '../simulated_data/Confidential_2000/delays'

PATH_TEST_LOG_MAS_central = '../simulated_data/Confidential_2000/no_delays_central/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_central = '../simulated_data/Confidential_2000/no_delays_central'



# [test log, simulated-log directory] pairs; main_ matches them to
# name_experiments by position, so both lists must keep the same order.
log_paths = [[PATH_TEST_LOG_SIMOD, PATH_SIMULATED_LOG_SIMOD], [PATH_TEST_LOG_SIMOD_extr, PATH_SIMULATED_LOG_SIMOD_extr], 
    [PATH_TEST_LOG_DGEN, PATH_SIMULATED_LOG_DGEN], [PATH_TEST_LOG_DSIM, PATH_SIMULATED_LOG_DSIM], [PATH_TEST_LOG_MAS, PATH_SIMULATED_LOG_MAS],
    [PATH_TEST_LOG_MAS_extr, PATH_SIMULATED_LOG_MAS_extr], [PATH_TEST_LOG_MAS_central,PATH_SIMULATED_LOG_MAS_central],
    ]
name_experiments = ['SIMOD', 'SIMOD_extr', 'DGEN', 'DSIM', 'MAS', 'MAS_extr', 'MAS_central']

In [58]:
# Uses log_paths / name_experiments from the Confidential 2000 cell directly above;
# every dataset section rebinds these names, so run that cell first.
mean_results, results_df = main_(log_paths, name_experiments)
# Per metric column: minimum highlighted green, maximum red (highlight_min_max).
styled_df = mean_results.style.apply(highlight_min_max)
styled_df

Unnamed: 0,N-Gram Distribution Distance,Absolute Event Distribution Distance,Case Arrival Distribution Distance,Circadian Event Distribution Distance,Relative Event Distribution Distance,Cycle Time Distribution Distance
SIMOD,0.246,820.454,148.235,2.968,952.378,1614.918
SIMOD_extr,0.244,816.741,184.304,3.116,982.095,1662.365
DGEN,0.161,857.683,876.657,18.092,4.588,8.12
DSIM,0.186,591.136,613.292,2.845,1.701,2.262
MAS,0.261,263.253,291.694,1.276,9.192,17.311
MAS_extr,0.192,130.789,176.112,1.657,30.924,48.224
MAS_central,0.253,236.912,266.964,1.49,9.302,17.729


## SynLoan

In [39]:
# SynLoan: for each experiment, the test log CSV and the directory in which
# main_ looks for simulated_log_<i>.csv. Only five experiments are active here;
# the SIMOD_extr and MAS_central entries are commented out.
PATH_TEST_LOG_SIMOD = '../simulated_data/simod/SynLoan/test_log.csv'
PATH_SIMULATED_LOG_SIMOD = '../simulated_data/simod/SynLoan'

# NOTE(review): these disabled paths still point at Confidential_2000, not SynLoan.
# PATH_TEST_LOG_SIMOD_extr = '../simulated_data/simod/Confidential_2000_extraneous/best_result/evaluation/test_log.csv'
# PATH_SIMULATED_LOG_SIMOD_extr = '../simulated_data/simod/Confidential_2000_extraneous/best_result/evaluation'

PATH_TEST_LOG_DGEN = '../simulated_data/deep_generator/SynLoan/test_log.csv'
PATH_SIMULATED_LOG_DGEN = '../simulated_data/deep_generator/SynLoan'

PATH_TEST_LOG_DSIM = '../simulated_data/deep_simulator/SynLoan/test_log.csv'
PATH_SIMULATED_LOG_DSIM = '../simulated_data/deep_simulator/SynLoan'

PATH_TEST_LOG_MAS = '../simulated_data/SynLoan/no_delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS = '../simulated_data/SynLoan/no_delays_decentral'

PATH_TEST_LOG_MAS_extr = '../simulated_data/SynLoan/delays_decentral/test_preprocessed.csv'
PATH_SIMULATED_LOG_MAS_extr = '../simulated_data/SynLoan/delays_decentral'

# PATH_TEST_LOG_MAS_central = '../simulated_data/SynLoan/no_delays_central/test_preprocessed.csv'
# PATH_SIMULATED_LOG_MAS_central = '../simulated_data/SynLoan/no_delays_central'



# [test log, simulated-log directory] pairs; main_ matches them to
# name_experiments by position, so both lists must keep the same order.
log_paths = [[PATH_TEST_LOG_SIMOD, PATH_SIMULATED_LOG_SIMOD], 
    [PATH_TEST_LOG_DGEN, PATH_SIMULATED_LOG_DGEN], [PATH_TEST_LOG_DSIM, PATH_SIMULATED_LOG_DSIM], 
    [PATH_TEST_LOG_MAS, PATH_SIMULATED_LOG_MAS],
    [PATH_TEST_LOG_MAS_extr, PATH_SIMULATED_LOG_MAS_extr], 
    # [PATH_TEST_LOG_MAS_central,PATH_SIMULATED_LOG_MAS_central],
    ]
name_experiments = ['SIMOD', 'DGEN', 'DSIM','MAS', 'MAS_extr']#, 'MAS_central']

In [40]:
# Uses log_paths / name_experiments from the SynLoan cell directly above;
# every dataset section rebinds these names, so run that cell first.
mean_results, results_df = main_(log_paths, name_experiments)
# Per metric column: minimum highlighted green, maximum red (highlight_min_max).
styled_df = mean_results.style.apply(highlight_min_max)
styled_df

Unnamed: 0,N-Gram Distribution Distance,Absolute Event Distribution Distance,Case Arrival Distribution Distance,Circadian Event Distribution Distance,Relative Event Distribution Distance,Cycle Time Distribution Distance
SIMOD,0.7,869.651,344.959,0.877,513.486,637.68
DGEN,0.396,915.207,554.565,3.716,349.326,402.954
DSIM,0.557,428.562,326.035,0.376,109.816,467.776
MAS,0.104,448.929,6.601,0.483,453.119,586.861
MAS_extr,0.102,448.215,3.073,0.509,451.922,584.421
