In [42]:
import pandas as pd
import networkx as nx
import numpy as np
from tqdm import tqdm

from april import Dataset
from april.processmining import ProcessMap
from april.fs import get_event_log_files
from april.fs import get_process_model_files

# Event Log Information

A list of all event logs used in the evaluation (only logs with `p == 0.3` are included), with one summary row per log.

In [43]:
# Names of all event logs whose `p` equals 0.3, in sorted order.
logs = sorted(e.name for e in get_event_log_files() if e.p == 0.3)

# Column layout of the summary table; must match the row order built below.
columns = [
    'name', 'base_name',
    'num_cases', 'num_events', 'num_activities',
    'num_attributes', 'attribute_keys', 'attribute_dims',
    'min_attribute_dim', 'max_attribute_dim',
    'min_case_len', 'max_case_len', 'mean_case_len',
]

df = []
for log in tqdm(logs):
    dataset = Dataset(log)

    # Entry 0 of attribute_dims is reported as 'num_activities'; the
    # remaining entries are reported under the attribute columns.
    attr_dims = dataset.attribute_dims[1:].astype(int)
    if attr_dims.size:
        dim_min, dim_max = attr_dims.min(), attr_dims.max()
    else:
        # No entries beyond index 0: leave min/max empty.
        dim_min = dim_max = None

    case_lens = dataset.case_lens
    df.append([
        log,
        log.split('-')[0],  # text before the first '-' in the log name
        dataset.num_cases,
        dataset.num_events,
        dataset.attribute_dims[0].astype(int),
        dataset.num_attributes - 1,
        dataset.attribute_keys[1:],
        attr_dims,
        dim_min,
        dim_max,
        case_lens.min(),
        case_lens.max(),
        case_lens.mean().round(2),
    ])

event_logs = pd.DataFrame(df, columns=columns)

100%|██████████| 44/44 [00:03<00:00, 11.93it/s]


## Basis for Table 1 in the Paper

In [44]:
# Display the per-log summary table assembled in the previous cell.
event_logs

Unnamed: 0,name,base_name,num_cases,num_events,num_activities,num_attributes,attribute_keys,attribute_dims,min_attribute_dim,max_attribute_dim,min_case_len,max_case_len,mean_case_len
0,bpic12-0.3-1,bpic12,13087,290424,73,0,[],[],,,3,175,22.19
1,bpic13-0.3-1,bpic13,1487,9793,15,4,"[org_group, org_resource, organization_country...","[31, 657, 35, 43]",31.0,657.0,3,37,6.59
2,bpic13-0.3-2,bpic13,7554,81427,27,4,"[org_group, org_resource, organization_country...","[987, 1836, 47, 65]",47.0,1836.0,3,125,10.78
3,bpic13-0.3-3,bpic13,819,4043,11,2,"[org_group, org_resource]","[23, 267]",23.0,267.0,3,24,4.94
4,bpic15-0.3-1,bpic15,1199,54770,466,2,"[monitoringResource, org_resource]","[53, 47]",47.0,53.0,4,104,45.68
5,bpic15-0.3-2,bpic15,832,46089,474,2,"[monitoringResource, org_resource]","[19, 23]",19.0,23.0,3,134,55.4
6,bpic15-0.3-3,bpic15,1409,62665,486,3,"[action_code, monitoringResource, org_resource]","[481, 45, 29]",29.0,481.0,5,126,44.47
7,bpic15-0.3-4,bpic15,1053,49493,422,2,"[monitoringResource, org_resource]","[25, 21]",21.0,25.0,3,118,47.0
8,bpic15-0.3-5,bpic15,1156,61465,446,2,"[monitoringResource, org_resource]","[33, 43]",33.0,43.0,7,156,53.17
9,bpic17-0.3-1,bpic17,31509,1268704,53,1,[org_resource],[299],299.0,299.0,10,182,40.26


In [45]:
# Restrict to the grouping key plus the size-related columns, then report
# count / min / max of each column per base_name.
size_stats = event_logs[[
    'base_name',
    'num_activities',
    'num_cases',
    'num_events',
    'min_attribute_dim',
    'max_attribute_dim',
]]
size_stats.groupby('base_name').agg(['count', 'min', 'max'])

Unnamed: 0_level_0,num_activities,num_activities,num_activities,num_cases,num_cases,num_cases,num_events,num_events,num_events,min_attribute_dim,min_attribute_dim,min_attribute_dim,max_attribute_dim,max_attribute_dim,max_attribute_dim
Unnamed: 0_level_1,count,min,max,count,min,max,count,min,max,count,min,max,count,min,max
base_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
bpic12,1,73,73,1,13087,13087,1,290424,290424,0,,,0,,
bpic13,3,11,27,3,819,7554,3,4043,81427,3,23.0,47.0,3,267.0,1836.0
bpic15,5,422,486,5,832,1409,5,46089,62665,5,19.0,47.0,5,23.0,481.0
bpic17,2,17,53,2,31509,42995,2,284664,1268704,2,289.0,299.0,2,289.0,299.0
gigantic,4,154,157,4,5000,5000,4,38744,42259,4,13.0,141.0,4,140.0,409.0
huge,4,109,109,4,5000,5000,4,46944,53614,4,13.0,141.0,4,141.0,420.0
large,4,85,85,4,5000,5000,4,61716,67487,4,13.0,141.0,4,141.0,398.0
medium,4,65,65,4,5000,5000,4,39071,42013,4,13.0,140.0,4,140.0,398.0
p2p,4,27,27,4,5000,5000,4,48484,53174,4,13.0,141.0,4,141.0,386.0
paper,1,27,27,1,5000,5000,1,66814,66814,1,13.0,13.0,1,13.0,13.0


# Process Model Information

In [64]:
# Sorted identifiers of all available process models.
maps = sorted(get_process_model_files())

# Column layout of the statistics table; must match the row order built below.
model_columns = ['nodes', 'edges', 'num_variants', 'max_case_len', 'density', 'in_deg', 'out_deg']

df = []
for process_map in tqdm(maps):
    model = ProcessMap.from_plg(process_map)
    graph = model.graph
    variants = model.variants

    # in_degree() / out_degree() yield (node, degree) pairs; average the degrees.
    mean_in_degree = np.mean([degree for _node, degree in graph.in_degree()])
    mean_out_degree = np.mean([degree for _node, degree in graph.out_degree()])

    df.append([
        graph.number_of_nodes(),
        graph.number_of_edges(),
        len(variants.cases),
        variants.max_case_len,
        nx.density(graph),
        mean_in_degree,
        mean_out_degree,
    ])

# One row per process model, indexed by the model identifier.
process_models = pd.DataFrame(df, index=maps, columns=model_columns)

100%|██████████| 9/9 [00:00<00:00, 45.06it/s]


In [66]:
# Show the models in a fixed presentation order, rounded to two decimals.
model_order = ['paper', 'p2p', 'small', 'medium', 'large', 'huge', 'gigantic', 'wide', 'testing']
process_models.loc[model_order].round(2)

Unnamed: 0,nodes,edges,num_variants,max_case_len,density,in_deg,out_deg
paper,16,18,8,12,0.08,1.12,1.12
p2p,15,18,8,11,0.09,1.2,1.2
small,22,26,6,10,0.06,1.18,1.18
medium,34,48,25,8,0.04,1.41,1.41
large,44,56,28,12,0.03,1.27,1.27
huge,56,75,39,11,0.02,1.34,1.34
gigantic,80,119,71,11,0.02,1.49,1.49
wide,36,53,19,7,0.04,1.47,1.47
testing,129,182,63,12,0.01,1.41,1.41
