# Evaluation

To inspect the evaulation results, we first need to load all files containing results from the output evaluation directory.\
We load the results as a pandas data frame and clean it. In the second next code block at the end, the resulting data frame can be display by including the commented out statement.

In [1]:
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))

from ipywidgets import widgets, interact, interact_manual, Layout, Button, Box
from IPython.display import display

import seaborn as sns
from matplotlib import pyplot as plt

from r2pa.api import routines
from april.fs import EVALUATION_DIR

import os
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
def create_dict_from_lists(key_list, value_list):
    dictionary = {}
    for i, key in enumerate(key_list):
        dictionary[key] = value_list[i]
    return dictionary

def create_combined_columns(df, columns):
    df[' '.join(columns)] = df[columns].apply(lambda x: ' '.join(x.astype(str)), axis=1)
    

output = widgets.Output()

# load all csv files and convert to data frame
all_csvs = [file for file in os.listdir(EVALUATION_DIR) if file[-4:] == '.csv']

all_dfs = []
for file in all_csvs:
    file_df = pd.read_csv(EVALUATION_DIR / file)
    all_dfs.append(file_df)
    
df = pd.concat(all_dfs)
df = df.drop(columns=['Unnamed: 0'])

# convert to correct data types
non_numeric_columns = ["dataset", "next event predictor", "use cache", 'group attribute nodes']
for column in df.columns:
    if column not in non_numeric_columns:
        df[column] = pd.to_numeric(df[column])

# rename columns
identifiers_columns = ['dataset', 'next event predictor', 'next event threshold']
identifiers_new_columns = ['dataset', 'nep', 'threshold']

df.rename(columns=create_dict_from_lists(identifiers_columns, identifiers_new_columns), inplace=True)

# add new columns
df['total case generation time (seconds)'] = df['case generation time'] + df['cache generation time']
df['total time per case (seconds)'] = df['total case generation time (seconds)'] / df['number of model cases']
df['cases to graph per case (seconds)'] = df['cases to graph conversion time'] / df['number of model cases']

# create combined columns and set index
create_combined_columns(df, ['dataset', 'nep', 'threshold'])

create_combined_columns(df, ['nep', 'threshold'])
identifiers_new_columns.append('nep threshold')

create_combined_columns(df, ['dataset', 'nep'])
identifiers_new_columns.append('dataset nep')

create_combined_columns(df, ['nep', 'use cache'])
identifiers_new_columns.append('nep use cache')

df = df.set_index('dataset nep threshold')

# display data frame
with output:
    with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
        display(df)

display(output)

Output()

## Discovery

After loading the data frame, we can start with the evaluation regarding process discovery evaluation metrics. We select the respective columns and rename them. \
We then plot results for the metrics F1 measure, precision, fitness,..

Note that the generalization metric is only calculated when using a cache in the case generation.
When the cache is not used, the metric is negative.

In [3]:
discovery_output = widgets.Output()

discovery_metrics_columns = ['precision', 'precision average distance', 'fitness', 'fitness average distance', 'f1 measure', 'number of nodes model graph', 'percentage of uncached cases only ground truth cases']
discovery_metrics_new_columns = ['P', 'PAD', 'F', 'FAD', 'F1', '#N', 'G']

df_discovery = df[identifiers_new_columns + discovery_metrics_columns]
df_discovery.rename(columns=create_dict_from_lists(discovery_metrics_columns, discovery_metrics_new_columns), inplace=True)

with discovery_output:
    # f1 measure
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='F1', hue='nep', estimator=max, ci=None, data=df_discovery)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.ylim(0, 1)
    plt.title('Best F1 Measure Scores Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('F1 Measure')
    plt.show()
    plt.close()
    
    # precision
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='P', hue='nep', estimator=max, ci=None, data=df_discovery)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.ylim(0, 1)
    plt.title('Best Precision Scores Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('Precision')
    plt.show()
    plt.close()
    
    # fitness
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='F', hue='nep', estimator=max, ci=None, data=df_discovery)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.ylim(0, 1)
    plt.title('Best Fitness Scores Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('Fitness')
    plt.show()
    plt.close()
    
    # generalization
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='G', hue='nep', estimator=max, ci=None, data=df_discovery)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.ylim(0, 1)
    plt.title('Best Generalization Scores Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('Generalization')
    plt.show()
    plt.close()
    
    # display(df_discovery)
        
display(discovery_output)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Output()

To inspect the result for a certain event log, set the variable in the next code block. The previous code block must be run before this one.

In [4]:
discovery_dataset_output = widgets.Output()

dataset_selection = 'papermanual-0.3-1'
df_discovery_dataset = df_discovery.loc[df_discovery['dataset'] == dataset_selection]

with discovery_dataset_output:
    # f1 measure
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='nep', y='F1', hue='threshold', estimator=max, ci=None, data=df_discovery_dataset)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.ylim(0, 1)
    plt.title(f'Best F1 Measure Scores For {dataset_selection}')
    plt.xlabel('NEP')
    plt.ylabel('F1 Measure')
    plt.show()
    plt.close()
    
    # precision
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='nep', y='P', hue='threshold', estimator=max, ci=None, data=df_discovery_dataset)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.ylim(0, 1)
    plt.title(f'Best Precision Scores For {dataset_selection}')
    plt.xlabel('NEP')
    plt.ylabel('Precision')
    plt.show()
    plt.close()
    
    # fitness
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='nep', y='F', hue='threshold', estimator=max, ci=None, data=df_discovery_dataset)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title='NEP')
    plt.ylim(0, 1)
    plt.title(f'Best Fitness Scores For {dataset_selection}')
    plt.xlabel('NEP')
    plt.ylabel('Fitness')
    plt.show()
    plt.close()
    
    # generalization
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='nep', y='G', hue='threshold', estimator=max, ci=None, data=df_discovery_dataset)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.ylim(0, 1)
    plt.title(f'Best Generalization Scores For {dataset_selection}')
    plt.xlabel('NEP')
    plt.ylabel('Generalization')
    plt.show()
    plt.close()
    
display(discovery_dataset_output)

Output()

## Likelihoods
Next, we take a look at how accurate the learned likelihoods are. We inspect the average deviation from the correct likelihoods for an event and for cases.

In [5]:
likelihood_output = widgets.Output()

likelihood_metrics_columns = ['absolute likelihood difference per case and attribute', 'mean squared error likelihoods', 'likelihood difference per case', 'normalized likelihood difference per case', ]
likelihood_metrics_new_columns = ['ALDE', 'MSE', 'ALDC', 'NALDC']

df_likelihoods = df[identifiers_new_columns + likelihood_metrics_columns]
df_likelihoods.rename(columns=create_dict_from_lists(likelihood_metrics_columns, likelihood_metrics_new_columns), inplace=True)

with likelihood_output:
    # ALDE
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='ALDE', hue='nep', estimator=min, ci=None, data=df_likelihoods)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title('Best ALDE Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('ALDE')
    plt.show()
    plt.close()
    
    # MSE
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='MSE', hue='nep', estimator=min, ci=None, data=df_likelihoods)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title('Best MSE Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('MSE')
    plt.show()
    plt.close()
    
    # ALDC
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='ALDC', hue='nep', estimator=min, ci=None, data=df_likelihoods)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title('Best ALDC Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('ALDC')
    plt.show()
    plt.close()
    
    # NALDC
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='NALDC', hue='nep', estimator=min, ci=None, data=df_likelihoods)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title('Best NALDC Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('NALDC')
    plt.show()
    plt.close()
    
    # display(df_likelihoods)
    
display(likelihood_output)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Output()

To inspect the result for a certain event log, set the variable in the next code block. The previous code block must be run before this one.

In [6]:
discovery_likelihoods_output = widgets.Output()

dataset_selection = 'papermanual-0.3-1'
df_likelihoods_dataset = df_likelihoods.loc[df_likelihoods['dataset'] == dataset_selection]

with discovery_likelihoods_output:
    # ALDE
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='nep', y='ALDE', hue='threshold', estimator=max, ci=None, data=df_likelihoods_dataset)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title(f'Best ALDE For {dataset_selection}')
    plt.xlabel('NEP')
    plt.ylabel('ALDE')
    plt.show()
    plt.close()
    
    # MSE
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='nep', y='MSE', hue='threshold', estimator=max, ci=None, data=df_likelihoods_dataset)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title(f'Best MSE For {dataset_selection}')
    plt.xlabel('NEP')
    plt.ylabel('MSE')
    plt.show()
    plt.close()
    
    # ALDC
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='nep', y='ALDC', hue='threshold', estimator=max, ci=None, data=df_likelihoods_dataset)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title(f'Best ALDC For {dataset_selection}')
    plt.xlabel('NEP')
    plt.ylabel('ALDC')
    plt.show()
    plt.close()
    
    # NALDC
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='nep', y='NALDC', hue='threshold', estimator=max, ci=None, data=df_likelihoods_dataset)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title(f'Best NALDC For {dataset_selection}')
    plt.xlabel('NEP')
    plt.ylabel('NALDC')
    plt.show()
    plt.close()
    
display(discovery_likelihoods_output)

Output()

## Timings

Finally, we can inspect the run time required to discover the process models.
Due to possibly generating different numbers of cases,\
in which the next event threshold plays a large role, we mostly regard the time taken per case generated.\
We also plot the differences in run time when using a cache.

In [8]:
timings_output = widgets.Output()

timings_metrics_columns = ['use cache', 'total case generation time (seconds)', 'total time per case (seconds)', 'cases to graph conversion time', 'cache generation time', 
                           'case generation time', 'cases to graph per case (seconds)']
timings_metrics_new_columns = ['use cache', 'total case generation', 'total time per case', 'graph creation', 'cache generation', 'case generation', 'graph creation per case']

df_timings = df[identifiers_new_columns + timings_metrics_columns]
df_timings.rename(columns=create_dict_from_lists(timings_metrics_columns, timings_metrics_new_columns), inplace=True)

with timings_output:
    # total case generation
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='total case generation', hue='nep', data=df_timings)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title('Case Generation Time Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('Time (Seconds)')
    plt.show()
    plt.close()
    
    # total time per case 
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='total time per case', hue='nep', data=df_timings)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title('Case Generation Time Per Case Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('Time Per Case (Seconds)')
    plt.show()
    plt.close()
    
    # total time per case 
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='graph creation per case', hue='nep', data=df_timings)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP")
    plt.title('Graph Creation Time Per Case Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('Time Per Case (Seconds)')
    plt.show()
    plt.close()
    
    # total case generation use cache
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='total case generation', hue='nep use cache', data=df_timings)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP, Use Cache")
    plt.title('Case Generation Time Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('Time (Seconds)')
    plt.show()
    plt.close()
    
    # total case generation per case use cache
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.barplot(x='dataset', y='total time per case', hue='nep use cache', data=df_timings)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="NEP, Use Cache")
    plt.title('Case Generation Time Per Case Across Event Logs')
    plt.xlabel('Event Log')
    plt.ylabel('Time (Seconds)')
    plt.show()
    plt.close()
    
    # display(df_timings)
        
display(timings_output)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Output()

Next, we plot the case generation time per case vs. the F1 measure. Therefore we can see which model performs best when considering run time.\

Note that the limits of the plot might need to be adjusted for other data.

In [10]:
scatter_output = widgets.Output()

df_scatter = df[identifiers_new_columns + discovery_metrics_columns + timings_metrics_columns]
df_scatter.rename(columns=create_dict_from_lists(discovery_metrics_columns, discovery_metrics_new_columns), inplace=True)
df_scatter.rename(columns=create_dict_from_lists(timings_metrics_columns, timings_metrics_new_columns), inplace=True)

with scatter_output:
    plt.figure(figsize=(20, 8))
    sns.set_theme(style="whitegrid")
    sns.scatterplot(data=df_scatter, x='total time per case', y='F1', hue='nep', style='dataset')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.title('Case Generation Time Per Case Across Event Logs')
    plt.xlabel('Generation Time Per Case (Seconds)')
    plt.ylabel('F1-Measure')
    plt.ylim(0.5, 1)
    plt.xlim(0, 0.2) # TODO:
    plt.show()
    plt.close()
    
    # display(df_scatter)
    
display(scatter_output)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Output()