In [2]:
import os
import warnings

import numpy as np
import pandas as pd
import pm4py

# Question 1

 <font size="6">1b</font>

In [3]:
# Load the full loan-application event log that the Question 1 cells analyse.
# The path is relative to the notebook's working directory.
log = pm4py.read_xes("loanApplications.xes.gz")

parsing log, completed traces ::   0%|          | 0/29067 [00:00<?, ?it/s]

In [8]:
# Total number of cases (traces) in the full log
num_traces = len(log)

# Successful applications: cases that contain at least one A_Pending event
log_successful = pm4py.filter_event_attribute_values(
    log,
    "concept:name",
    ["A_Pending"],
    level="case",
    retain=True,
)
num_traces_successful = len(log_successful)

successful_ratio_percentage = num_traces_successful / num_traces * 100
print(f"Ratio of sucessful applications: {successful_ratio_percentage}%")

Ratio of sucessful applications: 54.74249148518939%


In [10]:
# Unsuccessful applications: cases that contain an A_Denied or an A_Cancelled event
unsuccessful_end_activities = ["A_Denied", "A_Cancelled"]
log_unsuccessful = pm4py.filter_event_attribute_values(
    log,
    "concept:name",
    unsuccessful_end_activities,
    level="case",
    retain=True,
)
num_traces_unsuccessful = len(log_unsuccessful)

unsuccessful_ratio_percentage = num_traces_unsuccessful / num_traces * 100
print(f"Ratio of unsucessful applications: {unsuccessful_ratio_percentage}%")

Ratio of unsucessful applications: 44.944438710565244%


In [55]:
# Helper: per-value breakdown of a trace attribute (case count and success rate)
def inspect_attribute(log, log_successful, attribute):
    """Tabulate case counts and success percentage for every value of a trace attribute.

    Args:
        log: Log containing all cases.
        log_successful: Log containing only the cases regarded as successful.
        attribute: Name of the trace-level attribute to break down.

    Returns:
        A pandas DataFrame with one row per attribute value found in ``log`` and
        the columns ``[attribute, "num_cases", "successful_ratio_percentage"]``.
        Values that never occur in ``log_successful`` get a ratio of 0.
    """
    totals = pm4py.get_trace_attribute_values(log, attribute)
    successes = pm4py.get_trace_attribute_values(log_successful, attribute)

    rows = [
        [value, count, successes.get(value, 0) / count * 100]
        for value, count in totals.items()
    ]
    header = [attribute, "num_cases", "successful_ratio_percentage"]
    return pd.DataFrame(columns=header, data=rows)

In [56]:
# Break down cases by ApplicationType: number of cases per value and the
# percentage of those cases that are in log_successful.
inspect_attribute(log, log_successful, "ApplicationType")

Unnamed: 0,ApplicationType,num_cases,successful_ratio_percentage
0,New credit,25939,52.511662
1,Limit raise,3128,73.241688


In [57]:
# Break down cases by LoanGoal: number of cases per value and the
# percentage of those cases that are in log_successful.
inspect_attribute(log, log_successful, "LoanGoal")

Unnamed: 0,LoanGoal,num_cases,successful_ratio_percentage
0,Existing loan takeover,5160,55.01938
1,Home improvement,7086,58.777872
2,Car,8569,51.184502
3,"Other, see explanation",2763,50.92291
4,Remaining debt home,792,64.520202
5,Not speficied,989,41.051567
6,Unknown,2165,63.510393
7,Caravan / Camper,351,57.834758
8,Tax payments,141,48.93617
9,Extra spending limit,585,53.333333


In [64]:
# Event counts keyed by activity name; the O_Create Offer entry is reported as the number of offers created
activities = pm4py.get_event_attribute_values(log, "concept:name")
num_offers_created = activities['O_Create Offer']
print(f"The number of offers created in L is {num_offers_created}.")

The number of offers created in L is 39758.


In [69]:
# Keep only the cases in which O_Create Offer happened at least twice
log_repeated_offers = pm4py.filter_activities_rework(log, "O_Create Offer", 2)
num_repeated_offer_cases = len(log_repeated_offers)
print(num_repeated_offer_cases)

repeated_offer_ratio_percentage = num_repeated_offer_cases / num_traces * 100
print(f"The ratio of applications with more than one loan offer is {repeated_offer_ratio_percentage}%")

7936
The ratio of applications with more than one loan offer is 27.302439192211096%


In [71]:
# Refused offers, in absolute numbers and relative to all created offers
num_refused_offers = activities['O_Refused']
num_created_offers = activities['O_Create Offer']
print(f"The number of refused offers in L is {num_refused_offers}.")
print(f"The ratio of refused offers is {num_refused_offers / num_created_offers * 100}%.")

The number of refused offers in L is 4307.
The ratio of refused offers is 10.833039891342624%.


# Question 2

 <font size="6">2a</font>

In [2]:
# Load the filtered loan-application log that the Question 2 cells work on.
# The path is relative to the notebook's working directory.
log_2a = pm4py.read_xes('loanApplicationsFiltered.xes.gz')

parsing log, completed traces ::   0%|          | 0/28976 [00:00<?, ?it/s]

In [3]:
import re

# All distinct activity names (concept:name values) occurring in log_2a.
# NOTE: this rebinds `activities`, which the Question 1 cells use as a dict of
# counts taken from `log`; re-run this cell before calling
# get_all_activities_by_types.
activities = list(pm4py.get_event_attribute_values(log_2a, "concept:name").keys())


def get_all_activities_by_types(type='A'):
    """Select the names in the module-level ``activities`` that carry a given prefix.

    Args:
        type: 'W' selects names starting with 'W_', 'O' selects names starting
            with 'O_'; every other value (including the default 'A') selects
            names starting with 'A_'.

    Returns:
        List of matching activity names, in the order they appear in ``activities``.
    """
    pattern = '^W_.*' if type == 'W' else '^O_.*' if type == 'O' else '^A_.*'
    return [name for name in activities if re.match(pattern, name)]


# Split the activity names by prefix: A_ (default argument), W_ and O_.
A_activites = get_all_activities_by_types()
W_activites = get_all_activities_by_types('W')
O_activites = get_all_activities_by_types('O')

In [4]:
# Build one sub-log per activity prefix from log_2a. The filter runs at event
# level with retain=True, i.e. it keeps the events whose concept:name is in the
# given list.
L_A = pm4py.filter_event_attribute_values(log_2a, 'concept:name', A_activites, level='event', retain=True)
L_W = pm4py.filter_event_attribute_values(log_2a, 'concept:name', W_activites, level='event', retain=True)
L_O = pm4py.filter_event_attribute_values(log_2a, 'concept:name', O_activites, level='event', retain=True)

In [5]:
def get_basic_stats(log):
    """Compute basic size statistics of an event log.

    Args:
        log: Iterable of traces; each trace must support ``len()`` (its number
            of events). The log itself must support ``len()`` as well.

    Returns:
        dict with the keys ``number_of_case``, ``number_of_variant``,
        ``number_of_event``, ``avg_number_of_event_per_trace`` and
        ``number_of_distinct_activities``. For an empty log the average is
        reported as 0 instead of raising ``ZeroDivisionError``.
    """
    number_of_case = len(log)
    number_of_variant = len(pm4py.get_variants_as_tuples(log))
    number_of_event = sum(len(trace) for trace in log)
    # Guard the division: an empty log has no meaningful average trace length.
    avg_number_of_event_per_trace = number_of_event / number_of_case if number_of_case else 0
    number_of_distinct_activities = len(pm4py.get_event_attribute_values(log, "concept:name"))
    return {
        'number_of_case': number_of_case,
        'number_of_variant': number_of_variant,
        'number_of_event': number_of_event,
        'avg_number_of_event_per_trace': avg_number_of_event_per_trace,
        'number_of_distinct_activities': number_of_distinct_activities
    }


In [6]:
# Basic statistics for each of the three sub-logs, collected into one table
# with one row per sub-log (A, W, O).
L_A_stats = get_basic_stats(L_A)
L_W_stats = get_basic_stats(L_W)
L_O_stats = get_basic_stats(L_O)
logs_2a_stats = pd.DataFrame([L_A_stats, L_W_stats, L_O_stats], index=['A', 'W', 'O'])
logs_2a_stats

Unnamed: 0,number_of_case,number_of_variant,number_of_event,avg_number_of_event_per_trace,number_of_distinct_activities
A,28976,88,220481,7.60909,10
W,28976,1018,175545,6.05829,6
O,28976,815,178612,6.164136,8


<font size="6">2b</font>

In [7]:
# Load the offers log from file. It is a separate log from L_O, which was
# derived above by filtering log_2a.
L_Os = pm4py.read_xes('./loanOffers.xes.gz')


parsing log, completed traces ::   0%|          | 0/39589 [00:00<?, ?it/s]

In [8]:
import os

# Create the export folder if it is missing. makedirs(exist_ok=True) is a no-op
# when the folder already exists and avoids the check-then-create race of
# os.path.exists() followed by os.mkdir().
os.makedirs('./xes', exist_ok=True)
# Export the O sub-log (filtered from log_2a) as XES.
pm4py.write_xes(L_O, './xes/2aO.xes')

exporting log, completed traces ::   0%|          | 0/28976 [00:00<?, ?it/s]

<font size="6">2c</font>

In [10]:
from pm4py.algo.conformance.alignments.petri_net import algorithm as alignments
from pm4py.algo.conformance.tokenreplay import algorithm as token_replay
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator


def fitness_and_precision_calculator(net, initial_marking, final_marking, log_to_apply):
    """Evaluate fitness and precision of a Petri net against a log, via alignments and token replay.

    The alignment-based metrics are best effort: if computing them raises an
    ordinary exception, a warning is emitted and they are reported as 0.
    Interrupts (KeyboardInterrupt, SystemExit) are no longer swallowed. The
    token-replay metrics are always computed, and their errors propagate.

    Args:
        net: Petri net to evaluate.
        initial_marking: Initial marking of ``net``.
        final_marking: Final marking of ``net``.
        log_to_apply: Event log the model is evaluated against.

    Returns:
        dict with the keys 'perc_fit_trace (alignment)',
        'perc_fit_trace (token replay)', 'average_fitness (alignment)',
        'average_fitness (token replay)', 'precision (alignment)' and
        'precision (token replay)'.
    """
    try:
        aligned_traces_IM = alignments.apply_log(log_to_apply, net, initial_marking, final_marking)
        fitness_alignment_IM = replay_fitness.evaluate(aligned_traces_IM, variant=replay_fitness.Variants.ALIGNMENT_BASED)
        precision_alignment_IM = precision_evaluator.apply(log_to_apply, net, initial_marking, final_marking,
                                                           variant=precision_evaluator.Variants.ALIGN_ETCONFORMANCE)
    except Exception as exc:
        # Keep the run going, but make the failure visible instead of hiding it.
        warnings.warn(f"Alignment-based evaluation failed ({exc!r}); reporting 0 for the alignment metrics.")
        fitness_alignment_IM = {'percFitTraces': 0, 'averageFitness': 0, 'percentage_of_fitting_traces': 0, 'average_trace_fitness': 0}
        precision_alignment_IM = 0

    replayed_traces_IM = token_replay.apply(log_to_apply, net, initial_marking, final_marking)
    fitness_token_replay_IM = replay_fitness.evaluate(replayed_traces_IM, variant=replay_fitness.Variants.TOKEN_BASED)
    precision_token_replay_IM = precision_evaluator.apply(log_to_apply, net, initial_marking, final_marking,
                                                          variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)

    return {
        'perc_fit_trace (alignment)': fitness_alignment_IM['percentage_of_fitting_traces'],
        'perc_fit_trace (token replay)': fitness_token_replay_IM['percentage_of_fitting_traces'],
        'average_fitness (alignment)': fitness_alignment_IM['average_trace_fitness'],
        'average_fitness (token replay)': fitness_token_replay_IM['average_trace_fitness'],
        'precision (alignment)': precision_alignment_IM,
        'precision (token replay)': precision_token_replay_IM,
    }


def simplicity_and_generalization_calculator(net, initial_marking, final_marking, log_to_apply):
    """Return ``(simplicity, generalization)`` for a Petri net.

    Simplicity is evaluated on the net alone; generalization is evaluated on
    the net with its markings against ``log_to_apply``.
    """
    return (
        simplicity_evaluator.apply(net),
        generalization_evaluator.apply(log_to_apply, net, initial_marking, final_marking),
    )

In [11]:
# Discover a model for every (log, algorithm, noise threshold) combination and
# collect its quality metrics. result_2c maps metric name -> list of values;
# index_2c holds the matching row labels.
algos = ['alpha', 'im']
noise_thresholds = [0, 0.2, 0.5]
log_2c = [(L_A, 'A'), (L_W, 'W'), (L_Os, 'O')]
result_2c = {}
index_2c = []

# Both output folders must exist before pm4py writes into them.
os.makedirs('./xes', exist_ok=True)
os.makedirs('./pnml', exist_ok=True)

# Export each input log. The loop variables are deliberately not called `log`,
# so the Question 1 event log stays intact when cells are re-run.
for event_log, label in log_2c:
    pm4py.write_xes(event_log, f'./xes/L_{label}.xes')

for log_to_apply in log_2c:
    event_log, label = log_to_apply
    for algo in algos:
        for threshold in noise_thresholds:
            if algo == 'alpha' and threshold == 0:
                # The alpha miner takes no noise threshold, so it runs once per log.
                net, initial_marking, final_marking = pm4py.discover_petri_net_alpha(event_log)
            elif algo == 'im':
                # Pass the threshold by keyword so it cannot bind to a different
                # positional parameter in other pm4py versions.
                net, initial_marking, final_marking = pm4py.discover_petri_net_inductive(
                    event_log, noise_threshold=threshold)
            else:
                continue

            pm4py.write_pnml(net, initial_marking, final_marking, f'./pnml/2c-{label}-{algo}-{threshold}.pnml')
            index_2c.append(f'{label}-{algo}-{threshold}')

            metrics = dict(fitness_and_precision_calculator(net, initial_marking, final_marking, event_log))
            simplicity, generalization = simplicity_and_generalization_calculator(
                net, initial_marking, final_marking, event_log)
            metrics['simplicity'] = simplicity
            metrics['generalization'] = generalization

            for key, value in metrics.items():
                result_2c.setdefault(key, []).append(value)



exporting log, completed traces ::   0%|          | 0/28976 [00:00<?, ?it/s]

exporting log, completed traces ::   0%|          | 0/28976 [00:00<?, ?it/s]

exporting log, completed traces ::   0%|          | 0/39589 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/58 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/58 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/58 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/58 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/58 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/58 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/58 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/58 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/88 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/1534 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1534 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/1534 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1534 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/1534 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1534 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/1534 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1534 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1018 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/6 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

In [13]:
# One row per discovered model (labelled log-algorithm-threshold), one column
# per quality metric collected in result_2c.
df_stats_2c = pd.DataFrame(result_2c, index=index_2c)
df_stats_2c

Unnamed: 0,perc_fit_trace (alignment),perc_fit_trace (token replay),average_fitness (alignment),average_fitness (token replay),precision (alignment),precision (token replay),simplicity,generalization
A-alpha-0,23.84042,23.84042,0.877945,0.872103,0.426619,0.426619,1.0,0.992186
A-im-0,100.0,100.0,1.0,1.0,0.853174,0.853174,0.714286,0.923102
A-im-0.2,99.527195,99.527195,0.999512,0.999346,0.925764,0.925764,0.735849,0.923466
A-im-0.5,12.703617,12.703617,0.90015,0.879231,0.978533,0.978533,0.8,0.901125
W-alpha-0,0.06212,0.06212,0.426083,0.49886,0.313408,0.313408,1.0,0.988321
W-im-0,100.0,100.0,1.0,1.0,0.50762,0.50762,0.677419,0.982717
W-im-0.2,99.927526,99.927526,0.999929,0.999971,0.55282,0.55282,0.694915,0.990756
W-im-0.5,24.679045,24.679045,0.868617,0.905827,0.707679,0.707679,0.72549,0.942955
O-alpha-0,0.0,0.0,0.0,0.904405,0.0,0.812486,0.555556,0.990546
O-im-0,100.0,100.0,1.0,1.0,0.780093,0.780093,0.666667,0.988802


<font size="6">2e</font>

In [40]:
# Load a Petri net and its markings from the PNML file named after the
# O / im / 0.2 configuration.
# NOTE(review): the variable names end in `_2d_A` although the file name refers
# to the O log — confirm which model is intended.
net_2d_A, initial_marking_2d_A, final_marking_2d_A = pm4py.read_pnml('./pnml/2c-O-im-0.2.pnml')

In [42]:
# Align the offers log against the loaded model, print the log side of the first
# costly alignment, then compute alignment-based fitness and precision.
try:
    aligned_traces_IM = alignments.apply_log(L_Os, net_2d_A, initial_marking_2d_A, final_marking_2d_A)
    # Show the first alignment whose cost reaches 10000
    # (presumably pm4py's standard cost of a single log/model move — TODO confirm).
    for alignment in aligned_traces_IM:
        if alignment['cost'] >= 10000:
            for e in alignment['alignment']:
                print(f'{e[0]} ->', end=" ")
            break
    fitness_alignment_IM = replay_fitness.evaluate(aligned_traces_IM, variant=replay_fitness.Variants.ALIGNMENT_BASED)
    # Precision must be measured on the same log the model was aligned with
    # (L_Os); the previous version passed L_A here.
    precision_alignment_IM = precision_evaluator.apply(L_Os, net_2d_A, initial_marking_2d_A, final_marking_2d_A,
                                                       variant=precision_evaluator.Variants.ALIGN_ETCONFORMANCE)
except Exception as exc:
    # Best effort: report zeros, but surface the reason and do not swallow
    # KeyboardInterrupt/SystemExit.
    warnings.warn(f"Alignment-based evaluation failed ({exc!r}); reporting 0 for the alignment metrics.")
    fitness_alignment_IM = {'percFitTraces': 0, 'averageFitness': 0, 'percentage_of_fitting_traces': 0, 'average_trace_fitness': 0}
    precision_alignment_IM = 0

aligning log, completed variants ::   0%|          | 0/12 [00:00<?, ?it/s]

O_Create Offer -> O_Created -> >> -> >> -> O_Cancelled -> 

computing precision with alignments, completed variants ::   0%|          | 0/58 [00:00<?, ?it/s]

<font size="6">2f</font>

In [12]:
# Distinct activity names of log_2a. The Petri net construction cell creates one
# labelled transition per entry of this list, so run this cell first.
activities = list(pm4py.get_event_attribute_values(log_2a, "concept:name").keys())
activities

['A_Create Application',
 'A_Submitted',
 'W_Handle leads',
 'A_Concept',
 'A_Accepted',
 'O_Create Offer',
 'O_Created',
 'O_Sent (mail and online)',
 'W_Complete application',
 'A_Complete',
 'W_Call after offers',
 'A_Validating',
 'O_Returned',
 'W_Validate application',
 'A_Incomplete',
 'W_Call incomplete files',
 'O_Accepted',
 'A_Pending',
 'A_Denied',
 'O_Refused',
 'O_Cancelled',
 'A_Cancelled',
 'O_Sent (online only)',
 'W_Assess potential fraud']

In [11]:
from pm4py.objects.petri_net.obj import PetriNet, Marking
from pm4py.objects.petri_net.utils import petri_utils

# Hand-built Petri net: source/sink plus places p2..p33, one labelled transition
# per activity name in `activities`, and silent transitions t1..t14.
net = PetriNet("new_petri_net")
source = PetriNet.Place("source")
sink = PetriNet.Place("sink")
# add the places to the Petri Net
net.places.add(source)
net.places.add(sink)

place_dict = {}
for place_number in range(2, 34):
    place_name = f'p{place_number}'
    place = PetriNet.Place(place_name)
    net.places.add(place)
    place_dict[place_name] = place

transition_dict = {}
# Visible transitions: name and label are both the activity name.
for activity in activities:
    visible = PetriNet.Transition(activity, activity)
    net.transitions.add(visible)
    transition_dict[activity] = visible

# Silent transitions (label None). They are keyed by their name ('t1'..'t14'),
# because that is how the arcs below look them up; keying by the loop integer
# made every transition_dict['tN'] lookup raise KeyError.
for silent_number in range(1, 15):
    silent_name = f't{silent_number}'
    silent = PetriNet.Transition(silent_name, None)
    net.transitions.add(silent)
    transition_dict[silent_name] = silent


def _connect(place_in, transition, place_out):
    """Wire place_in -> transition -> place_out into `net`."""
    petri_utils.add_arc_from_to(place_in, transition, net)
    petri_utils.add_arc_from_to(transition, place_out, net)


_connect(source, transition_dict['A_Create Application'], place_dict['p2'])
_connect(place_dict['p2'], transition_dict['A_Submitted'], place_dict['p3'])
_connect(place_dict['p2'], transition_dict['t1'], place_dict['p4'])
_connect(place_dict['p3'], transition_dict['t2'], place_dict['p4'])
_connect(place_dict['p3'], transition_dict['W_Handle leads'], place_dict['p4'])

petri_utils.add_arc_from_to(place_dict['p3'], transition_dict['t3'], net)
# FIXME: t3 has no outgoing arc yet. The original cell repeated the
# 'W_Handle leads' -> p4 arc here (a copy-paste slip that added a duplicate arc);
# the duplicate was dropped. Wire t3 to its intended output place.

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
