In [1]:
from pandas import set_option
# Print wide DataFrames on one line per row instead of wrapping the columns
# across several stacked blocks; the printed tables below rely on this.
set_option('display.expand_frame_repr', False)

# Process Causality Project
## First attempts
The basic idea is to compare two processes of the same origin. To do this, we first examined the resources publicly available on the Internet. Since this use case is rather uncommon, the options were quickly exhausted. However, research and feedback suggested that the BPI, which organizes an annual challenge, probably used the same data source twice. On this basis, we were able to find the following sources:

| Challenge | Link | File |
|:--- |:--- |:--- |
| 2012 | https://www.win.tue.nl/bpi/doku.php?id=2012:challenge | financial_log.xes.gz |
| 2017 | https://www.win.tue.nl/bpi/doku.php?id=2017:challenge | BPI Challenge 2017.xes.gz |


In [2]:
from pm4py import read_xes
from pm4py import convert_to_dataframe as as_frame
from environment import *
# Read both BPI Challenge XES logs (2012 first, then 2017) and flatten each one
# into a DataFrame; the generator keeps the original read/convert order.
bpi2012, bpi2017 = (
    as_frame(read_xes(str(XES_LOGS_DIR_PATH / log_file)))
    for log_file in ('financial_log.xes.gz', 'BPI Challenge 2017.xes.gz')
)

# Show both event logs, one after the other.
print(bpi2012, bpi2017, sep='\n')

parsing log, completed traces :: 100%|██████████| 13087/13087 [00:06<00:00, 2178.26it/s]
parsing log, completed traces :: 100%|██████████| 31509/31509 [00:36<00:00, 861.63it/s] 


       org:resource lifecycle:transition            concept:name                    time:timestamp                     case:REG_DATE case:concept:name case:AMOUNT_REQ
0               112             COMPLETE             A_SUBMITTED  2011-10-01 00:38:44.546000+02:00  2011-10-01 00:38:44.546000+02:00            173688           20000
1               112             COMPLETE       A_PARTLYSUBMITTED  2011-10-01 00:38:44.880000+02:00  2011-10-01 00:38:44.546000+02:00            173688           20000
2               112             COMPLETE           A_PREACCEPTED  2011-10-01 00:39:37.906000+02:00  2011-10-01 00:38:44.546000+02:00            173688           20000
3               112             SCHEDULE  W_Completeren aanvraag  2011-10-01 00:39:38.875000+02:00  2011-10-01 00:38:44.546000+02:00            173688           20000
4               NaN                START  W_Completeren aanvraag  2011-10-01 11:36:46.437000+02:00  2011-10-01 00:38:44.546000+02:00            173688           2000

To get a better understanding of these processes, it is helpful to display the activities. According to pm4py, the activities are stored under `'concept:name'` and the cases under `'case:concept:name'`.

In [3]:

# Column names used for the case identifier and the activity label.
case_id, activity_id = 'case:concept:name', 'concept:name'

# Number of distinct cases per log (dropna=False keeps parity with len(unique())).
print('bpi2012: num of cases:', bpi2012[case_id].nunique(dropna=False))
print('bpi2017: num of cases:', bpi2017[case_id].nunique(dropna=False))

# Distinct activity labels per log; kept as variables for later inspection.
bpi2012_activities = bpi2012[activity_id].unique()
bpi2017_activities = bpi2017[activity_id].unique()
print('bpi2012: num of possible activities:', len(bpi2012_activities))
print('bpi2017: num of possible activities:', len(bpi2017_activities))

# Alphabetical listing makes the two vocabularies easier to compare by eye.
print(sorted(bpi2012_activities))
print(sorted(bpi2017_activities))

bpi2012: num of cases: 13087
bpi2017: num of cases: 31509
bpi2012: num of possible activities: 24
bpi2017: num of possible activities: 26
['A_ACCEPTED', 'A_ACTIVATED', 'A_APPROVED', 'A_CANCELLED', 'A_DECLINED', 'A_FINALIZED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'A_REGISTERED', 'A_SUBMITTED', 'O_ACCEPTED', 'O_CANCELLED', 'O_CREATED', 'O_DECLINED', 'O_SELECTED', 'O_SENT', 'O_SENT_BACK', 'W_Afhandelen leads', 'W_Beoordelen fraude', 'W_Completeren aanvraag', 'W_Nabellen incomplete dossiers', 'W_Nabellen offertes', 'W_Valideren aanvraag', 'W_Wijzigen contractgegevens']
['A_Accepted', 'A_Cancelled', 'A_Complete', 'A_Concept', 'A_Create Application', 'A_Denied', 'A_Incomplete', 'A_Pending', 'A_Submitted', 'A_Validating', 'O_Accepted', 'O_Cancelled', 'O_Create Offer', 'O_Created', 'O_Refused', 'O_Returned', 'O_Sent (mail and online)', 'O_Sent (online only)', 'W_Assess potential fraud', 'W_Call after offers', 'W_Call incomplete files', 'W_Complete application', 'W_Handle leads', 'W_Personal L

With so many possible activities and the differences in language between the two processes, it is really hard to understand how the process changed. For this project it may take too long to fully understand these processes, so we decided to use another approach.

## Simulation
To get two versions of a process, we decided to create our own processes. Therefore, we built two BPMN models representing a basic version of an order-to-cash process and a changed version. For a better understanding of the process, we provide a so-called ruleset representing the activities.

In [4]:
# Baseline order-to-cash ruleset.
# The activity labels are runtime keys: they keep the exact spelling that shows
# up in the simulated event log (e.g. "availabilty", "Archieve"), so do not
# "correct" them here.
# NOTE(review): tuples presumably group branches that belong together and lists
# the ordered steps inside one branch - confirm against calculate_outcome.
unchanged_basic_ruleset = [
    "Check stock availability",
    "Check raw materials availabilty",
    (
        ["Request raw materials from Supplier 1", "Obtain raw materials from Supplier 1"],
        ["Request raw materials from Supplier 2", "Obtain raw materials from Supplier 2"],
    ),
    "Manufacture product",
    "Retrieve product from warehouse",
    "Confirm order",
    (
        ["Get shipping address", "Ship product"],
        ["Emit invoice", "Receive Payment"],
    ),
    "Archieve order",
]

# Changed order-to-cash ruleset. Compared with the baseline it adds the customer
# notification, replaces the per-supplier request/obtain pairs with one
# "Start request" group followed by one "Obtain" group, and moves
# "Get shipping address" in front of the shipping/invoicing group.
changed_basic_ruleset = [
    "Check stock availability",
    "Check raw materials availabilty",
    "Notify unavailability to customer",
    ("Start request raw materials from Supplier 1", "Start request raw materials from Supplier 2"),
    ("Obtain raw materials from Supplier 1", "Obtain raw materials from Supplier 2"),
    "Manufacture product",
    "Retrieve product from warehouse",
    "Confirm order",
    "Get shipping address",
    ("Ship product", ["Emit invoice", "Receive Payment"]),
    "Archieve order",
]

# Show both rulesets, baseline first.
print(unchanged_basic_ruleset, changed_basic_ruleset, sep='\n')

['Check stock availability', 'Check raw materials availabilty', (['Request raw materials from Supplier 1', 'Obtain raw materials from Supplier 1'], ['Request raw materials from Supplier 2', 'Obtain raw materials from Supplier 2']), 'Manufacture product', 'Retrieve product from warehouse', 'Confirm order', (['Get shipping address', 'Ship product'], ['Emit invoice', 'Receive Payment']), 'Archieve order']
['Check stock availability', 'Check raw materials availabilty', 'Notify unavailability to customer', ('Start request raw materials from Supplier 1', 'Start request raw materials from Supplier 2'), ('Obtain raw materials from Supplier 1', 'Obtain raw materials from Supplier 2'), 'Manufacture product', 'Retrieve product from warehouse', 'Confirm order', 'Get shipping address', ('Ship product', ['Emit invoice', 'Receive Payment']), 'Archieve order']


These activities define two simple but different processes. For experimentation, we can now load the BPMN models and simulate some event logs.

In [5]:
from source.misc import read_bpmn
from source.simulation import basic_bpmn_petri_net

# Load the baseline model (Model-1) and the modified model (Model-2).
unchanged_bpmn, changed_bpmn = (
    read_bpmn(BPMN_DIR_PATH, model_file)
    for model_file in ('Order-to-Cash-Model-1.bpmn', 'Order-to-Cash-Model-2.bpmn')
)

# Produce one event log per model, baseline first, then the changed model.
unchanged_eventlog, changed_eventlog = map(
    basic_bpmn_petri_net, (unchanged_bpmn, changed_bpmn)
)

print(unchanged_eventlog, changed_eventlog, sep='\n')

                                concept:name      time:timestamp case:concept:name
0                   Check stock availability 1970-04-26 19:46:40             C0000
1            Check raw materials availabilty 1970-04-26 19:46:41             C0000
2      Request raw materials from Supplier 2 1970-04-26 19:46:42             C0000
3       Obtain raw materials from Supplier 2 1970-04-26 19:46:43             C0000
4      Request raw materials from Supplier 1 1970-04-26 19:46:44             C0000
...                                      ...                 ...               ...
10535                           Emit invoice 1970-04-26 22:42:15             C0999
10536                   Get shipping address 1970-04-26 22:42:16             C0999
10537                           Ship product 1970-04-26 22:42:17             C0999
10538                        Receive Payment 1970-04-26 22:42:18             C0999
10539                         Archieve order 1970-04-26 22:42:19             C0999

[10

Despite having curious timestamps, both processes are simulated according to the BPMN models. If we now apply some scenario data to the processes, we get a more realistic version. But first of all, let us have a look at the scenarios.

In [6]:
from source.misc import get_scenario

# Load the scenario definition for each process variant from its CSV file
# (baseline first, then the changed variant).
unchanged_scenario, changed_scenario = (
    get_scenario(SIMULATION_DATA_DIR_PATH, scenario_file)
    for scenario_file in ('Order-to-Cash_unchanged.csv', 'Order-to-Cash_changed.csv')
)

print(unchanged_scenario, changed_scenario, sep='\n')

{'time': {'apply_to': None, 'functions': {'Check stock availability': <function get_scenario.<locals>.<lambda> at 0x000002060D0F6160>, 'Check raw materials availabilty': <function get_scenario.<locals>.<lambda> at 0x00000207341019D0>, 'Request raw materials from Supplier 1': <function get_scenario.<locals>.<lambda> at 0x0000020734101D30>, 'Request raw materials from Supplier 2': <function get_scenario.<locals>.<lambda> at 0x0000020734101E50>, 'Obtain raw materials from Supplier 1': <function get_scenario.<locals>.<lambda> at 0x0000020734101F70>, 'Obtain raw materials from Supplier 2': <function get_scenario.<locals>.<lambda> at 0x000002072E0680D0>, 'Manufacture product': <function get_scenario.<locals>.<lambda> at 0x000002072E0681F0>, 'Retrieve product from warehouse': <function get_scenario.<locals>.<lambda> at 0x000002072E068310>, 'Confirm order': <function get_scenario.<locals>.<lambda> at 0x000002072E068430>, 'Get shipping address': <function get_scenario.<locals>.<lambda> at 0x000

It is hard to see, but every activity has been assigned a function that simulates its behavior in a process run. If we now apply these functions, we obtain a more realistic event log.

In [7]:
from source.operation import apply_scenario

# Apply each scenario to the event log of the same variant, keyed by activity.
# NOTE(review): the event-log names are rebound to the results, so re-running
# this cell feeds already-processed logs back in - confirm apply_scenario
# tolerates that, or restart the kernel before re-running.
unchanged_eventlog, changed_eventlog = (
    apply_scenario(eventlog, scenario, activity_id)
    for eventlog, scenario in (
        (unchanged_eventlog, unchanged_scenario),
        (changed_eventlog, changed_scenario),
    )
)

print(unchanged_eventlog, changed_eventlog, sep='\n')

                                concept:name      time:timestamp case:concept:name      time      cost
0                   Check stock availability 1970-04-26 19:46:40             C0000  0.016667  1.833333
1            Check raw materials availabilty 1970-04-26 19:46:41             C0000  0.016667  1.833333
2      Request raw materials from Supplier 2 1970-04-26 19:46:42             C0000  0.015661  1.783049
3       Obtain raw materials from Supplier 2 1970-04-26 19:46:43             C0000  0.014359  1.717951
4      Request raw materials from Supplier 1 1970-04-26 19:46:44             C0000  0.014720  1.736022
...                                      ...                 ...               ...       ...       ...
10535                           Emit invoice 1970-04-26 22:42:15             C0999  0.016667  1.833333
10536                   Get shipping address 1970-04-26 22:42:16             C0999  0.016667  1.833333
10537                           Ship product 1970-04-26 22:42:17         

To get a view that is more convenient for machine learning, we can now convert the event logs into case tables.

In [8]:
from source.operation import to_case_table

# Turn each event log into a case table with identical settings for both
# variants: missing values filled with 0, 'cost' and 'time' summed.
# The aggregate dict is written inline so each call receives its own fresh dict.
unchanged_case_table, changed_case_table = (
    to_case_table(
        eventlog,
        case_id,
        activity_id,
        fillna=0,
        aggregate={'cost': 'sum', 'time': 'sum'},
    )
    for eventlog in (unchanged_eventlog, changed_eventlog)
)

print(unchanged_case_table, changed_case_table, sep='\n')

                   cost Archieve order  cost Check raw materials availabilty  cost Check stock availability  cost Confirm order  cost Emit invoice  cost Get shipping address  cost Manufacture product  cost Obtain raw materials from Supplier 1  cost Obtain raw materials from Supplier 2  cost Receive Payment  ...  Num of Emit invoice  Num of Get shipping address  Num of Manufacture product  Num of Obtain raw materials from Supplier 1  Num of Obtain raw materials from Supplier 2  Num of Receive Payment  Num of Request raw materials from Supplier 1  Num of Request raw materials from Supplier 2  Num of Retrieve product from warehouse  Num of Ship product
case:concept:name                                                                                                                                                                                                                                                                                                    ...                              

Finally, we can apply the defined rules and calculate the result.

In [9]:
from source.operation import calculate_outcome

# Per-variant rulesets: 'time' uses the basic activity ruleset, 'cost' has none.
# NOTE(review): what a None ruleset means is decided inside calculate_outcome -
# confirm there.
unchanged_ruleset = {
    'time': unchanged_basic_ruleset,
    'cost': None,
}
changed_ruleset = {
    'time': changed_basic_ruleset,
    'cost': None,
}

# Compute the outcome for each case table with the ruleset of the same variant.
# The case-table names are rebound to the results, as in the earlier cells.
unchanged_case_table, changed_case_table = (
    calculate_outcome(case_table, ruleset)
    for case_table, ruleset in (
        (unchanged_case_table, unchanged_ruleset),
        (changed_case_table, changed_ruleset),
    )
)

print(unchanged_case_table, changed_case_table, sep='\n')

# Persist both case tables as CSV without the row index.
unchanged_csv_path = CASE_TABLE_DIR_PATH / 'unchanged.csv'
unchanged_case_table.to_csv(unchanged_csv_path, index=False)
changed_csv_path = CASE_TABLE_DIR_PATH / 'changed.csv'
changed_case_table.to_csv(changed_csv_path, index=False)

    case:concept:name  cost Archieve order  cost Check raw materials availabilty  cost Check stock availability  cost Confirm order  cost Emit invoice  cost Get shipping address  cost Manufacture product  cost Obtain raw materials from Supplier 1  cost Obtain raw materials from Supplier 2  ...  Num of Manufacture product  Num of Obtain raw materials from Supplier 1  Num of Obtain raw materials from Supplier 2  Num of Receive Payment  Num of Request raw materials from Supplier 1  Num of Request raw materials from Supplier 2  Num of Retrieve product from warehouse  Num of Ship product      time       cost
0               C0000             1.833333                              1.833333                       1.833333            1.833333           1.833333                   1.833333                  1.945955                                   1.827614                                   1.717951  ...                         1.0                                          1.0                      