In [1]:
#from pandas import set_option
#set_option('display.expand_frame_repr', False)

# Process Causality Project
## First attempts
The basic idea is to compare two processes of the same origin. To do this, we first examined the resources publicly available on the Internet. Since this use case is rather uncommon, the options were quickly exhausted. However, our research and the feedback we received indicated that the BPI Challenge, which is held annually, has probably used the same data source twice. On this basis, we were able to find the following sources:

| Challenge | Link | File |
|:--- |:--- |:--- |
| 2012 | https://www.win.tue.nl/bpi/doku.php?id=2012:challenge | financial_log.xes.gz |
| 2017 | https://www.win.tue.nl/bpi/doku.php?id=2017:challenge | BPI Challenge 2017.xes.gz |


In [2]:
from pm4py import read_xes
from pm4py import convert_to_dataframe as as_frame
# The path constants are imported by name rather than with a wildcard import,
# so it is visible where every name used in this notebook comes from. These
# are the four `environment` names the notebook's cells reference.
from environment import (
    BPMN_DIR_PATH,
    CASE_TABLE_DIR_PATH,
    SIMULATION_DATA_DIR_PATH,
    XES_LOGS_DIR_PATH,
)

# Read both BPI Challenge logs from the XES log directory and convert each
# one into a DataFrame, then print them for a first look at the columns.
bpi2012 = as_frame(read_xes(str(XES_LOGS_DIR_PATH/'financial_log.xes.gz')))
bpi2017 = as_frame(read_xes(str(XES_LOGS_DIR_PATH/'BPI Challenge 2017.xes.gz')))
print(bpi2012)
print(bpi2017)

parsing log, completed traces :: 100%|██████████| 13087/13087 [00:05<00:00, 2217.34it/s]
parsing log, completed traces :: 100%|██████████| 31509/31509 [00:36<00:00, 868.88it/s] 


       org:resource lifecycle:transition            concept:name  \
0               112             COMPLETE             A_SUBMITTED   
1               112             COMPLETE       A_PARTLYSUBMITTED   
2               112             COMPLETE           A_PREACCEPTED   
3               112             SCHEDULE  W_Completeren aanvraag   
4               NaN                START  W_Completeren aanvraag   
...             ...                  ...                     ...   
262195          112             COMPLETE       A_PARTLYSUBMITTED   
262196          112             SCHEDULE      W_Afhandelen leads   
262197        11169                START      W_Afhandelen leads   
262198        11169             COMPLETE              A_DECLINED   
262199        11169             COMPLETE      W_Afhandelen leads   

                          time:timestamp                     case:REG_DATE  \
0       2011-10-01 00:38:44.546000+02:00  2011-10-01 00:38:44.546000+02:00   
1       2011-10-01 00:38:44

To get a better understanding of these processes, it helps to display their activities. In pm4py, the activity is stored in the `'concept:name'` column and the case in the `'case:concept:name'` column.

In [3]:

# Column names under which the case identifier and the activity label are
# stored in the event DataFrames.
case_id = 'case:concept:name'
activity_id = 'concept:name'

# Distinct activity labels per log; reused below for the counts and listings.
bpi2012_activities = bpi2012[activity_id].unique()
bpi2017_activities = bpi2017[activity_id].unique()

# dropna=False keeps the count identical to len(<series>.unique()).
print('bpi2012: num of cases:', bpi2012[case_id].nunique(dropna=False))
print('bpi2017: num of cases:', bpi2017[case_id].nunique(dropna=False))
print('bpi2012: num of possible activities:', len(bpi2012_activities))
print('bpi2017: num of possible activities:', len(bpi2017_activities))
print(sorted(bpi2012_activities), sorted(bpi2017_activities), sep='\n')

bpi2012: num of cases: 13087
bpi2017: num of cases: 31509
bpi2012: num of possible activities: 24
bpi2017: num of possible activities: 26
['A_ACCEPTED', 'A_ACTIVATED', 'A_APPROVED', 'A_CANCELLED', 'A_DECLINED', 'A_FINALIZED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'A_REGISTERED', 'A_SUBMITTED', 'O_ACCEPTED', 'O_CANCELLED', 'O_CREATED', 'O_DECLINED', 'O_SELECTED', 'O_SENT', 'O_SENT_BACK', 'W_Afhandelen leads', 'W_Beoordelen fraude', 'W_Completeren aanvraag', 'W_Nabellen incomplete dossiers', 'W_Nabellen offertes', 'W_Valideren aanvraag', 'W_Wijzigen contractgegevens']
['A_Accepted', 'A_Cancelled', 'A_Complete', 'A_Concept', 'A_Create Application', 'A_Denied', 'A_Incomplete', 'A_Pending', 'A_Submitted', 'A_Validating', 'O_Accepted', 'O_Cancelled', 'O_Create Offer', 'O_Created', 'O_Refused', 'O_Returned', 'O_Sent (mail and online)', 'O_Sent (online only)', 'W_Assess potential fraud', 'W_Call after offers', 'W_Call incomplete files', 'W_Complete application', 'W_Handle leads', 'W_Personal L

With so many possible activities and the differences in language between the two processes, it is really hard to understand how the process changed. For this project, it would take too long to fully understand these processes, so we decided to use another approach.

## Simulation
To get two versions of a process, we decided to create our own processes. We therefore built two BPMN models representing a basic version of an order-to-cash process and a changed version. For a better understanding of the process, we provide a so-called ruleset representing the activities.

In [4]:
# Building blocks shared by both process variants. The activity labels are
# kept verbatim, spelling included: the simulated event logs and the case-table
# columns shown in this notebook use these exact strings.
_first_checks = ("Check stock availability", "Check raw materials availabilty")
_make_and_confirm = (
    "Manufacture product",
    "Retrieve product from warehouse",
    "Confirm order",
)
_invoice_steps = ("Emit invoice", "Receive Payment")
_last_step = "Archieve order"

# NOTE(review): a tuple appears to group branches that belong side by side and
# a nested list a fixed sequence within one branch -- confirm against
# `calculate_outcome`, which consumes these rulesets.
unchanged_basic_ruleset = [
    *_first_checks,
    (
        ["Request raw materials from Supplier 1", "Obtain raw materials from Supplier 1"],
        ["Request raw materials from Supplier 2", "Obtain raw materials from Supplier 2"],
    ),
    *_make_and_confirm,
    # list(...) builds a fresh list for each ruleset, so the two never share one.
    (["Get shipping address", "Ship product"], list(_invoice_steps)),
    _last_step,
]
changed_basic_ruleset = [
    *_first_checks,
    "Notify unavailability to customer",
    (
        "Start request raw materials from Supplier 1",
        "Start request raw materials from Supplier 2",
    ),
    ("Obtain raw materials from Supplier 1", "Obtain raw materials from Supplier 2"),
    *_make_and_confirm,
    "Get shipping address",
    ("Ship product", list(_invoice_steps)),
    _last_step,
]
print(unchanged_basic_ruleset, changed_basic_ruleset, sep='\n')

['Check stock availability', 'Check raw materials availabilty', (['Request raw materials from Supplier 1', 'Obtain raw materials from Supplier 1'], ['Request raw materials from Supplier 2', 'Obtain raw materials from Supplier 2']), 'Manufacture product', 'Retrieve product from warehouse', 'Confirm order', (['Get shipping address', 'Ship product'], ['Emit invoice', 'Receive Payment']), 'Archieve order']
['Check stock availability', 'Check raw materials availabilty', 'Notify unavailability to customer', ('Start request raw materials from Supplier 1', 'Start request raw materials from Supplier 2'), ('Obtain raw materials from Supplier 1', 'Obtain raw materials from Supplier 2'), 'Manufacture product', 'Retrieve product from warehouse', 'Confirm order', 'Get shipping address', ('Ship product', ['Emit invoice', 'Receive Payment']), 'Archieve order']


These activities define two simple but different processes. For experimentation, we can now load the BPMN models and simulate some event logs.

In [5]:
from source.misc import read_bpmn
from source.simulation import basic_bpmn_petri_net

# Load the two Order-to-Cash BPMN models: Model-1 is used as the unchanged
# process and Model-2 as the changed one.
unchanged_bpmn, changed_bpmn = (
    read_bpmn(BPMN_DIR_PATH, bpmn_file)
    for bpmn_file in ('Order-to-Cash-Model-1.bpmn', 'Order-to-Cash-Model-2.bpmn')
)

# Simulate one event log per model, unchanged first.
unchanged_eventlog, changed_eventlog = (
    basic_bpmn_petri_net(bpmn_model)
    for bpmn_model in (unchanged_bpmn, changed_bpmn)
)

print(unchanged_eventlog, changed_eventlog, sep='\n')

                          concept:name      time:timestamp case:concept:name
0             Check stock availability 1970-04-26 19:46:40             C0000
1      Retrieve product from warehouse 1970-04-26 19:46:41             C0000
2                        Confirm order 1970-04-26 19:46:42             C0000
3                         Emit invoice 1970-04-26 19:46:43             C0000
4                 Get shipping address 1970-04-26 19:46:44             C0000
...                                ...                 ...               ...
10555             Get shipping address 1970-04-26 22:42:35             C0999
10556                     Emit invoice 1970-04-26 22:42:36             C0999
10557                     Ship product 1970-04-26 22:42:37             C0999
10558                  Receive Payment 1970-04-26 22:42:38             C0999
10559                   Archieve order 1970-04-26 22:42:39             C0999

[10560 rows x 3 columns]
                                     concept:name 

Despite the curious timestamps, both processes are simulated according to the BPMN models. If we now apply some scenario data to the processes, we get a more realistic version. But first of all, let us have a look at the scenarios.

In [6]:
from source.misc import get_scenario

# Read one scenario per process variant from the simulation data directory.
# The printed result is a dict that, per key such as 'time', maps activity
# names to functions.
unchanged_scenario, changed_scenario = (
    get_scenario(SIMULATION_DATA_DIR_PATH, scenario_file)
    for scenario_file in ('Order-to-Cash_unchanged.csv', 'Order-to-Cash_changed.csv')
)

print(unchanged_scenario, changed_scenario, sep='\n')

{'time': {'apply_to': None, 'functions': {'Check stock availability': <function get_scenario.<locals>.<lambda> at 0x000001C6254833A0>, 'Check raw materials availabilty': <function get_scenario.<locals>.<lambda> at 0x000001C73537BAF0>, 'Request raw materials from Supplier 1': <function get_scenario.<locals>.<lambda> at 0x000001C73537BDC0>, 'Request raw materials from Supplier 2': <function get_scenario.<locals>.<lambda> at 0x000001C73537BEE0>, 'Obtain raw materials from Supplier 1': <function get_scenario.<locals>.<lambda> at 0x000001C73275B040>, 'Obtain raw materials from Supplier 2': <function get_scenario.<locals>.<lambda> at 0x000001C73275B160>, 'Manufacture product': <function get_scenario.<locals>.<lambda> at 0x000001C73275B280>, 'Retrieve product from warehouse': <function get_scenario.<locals>.<lambda> at 0x000001C73275B3A0>, 'Confirm order': <function get_scenario.<locals>.<lambda> at 0x000001C73275B4C0>, 'Get shipping address': <function get_scenario.<locals>.<lambda> at 0x000

It is hard to see, but every activity has been assigned a function that simulates its behavior in a process run. If we now apply these functions, we obtain a more realistic event log.

In [7]:
from source.operation import apply_scenario

# Apply each scenario to its simulated event log, matching on the activity
# column.
# NOTE(review): the results are bound to the same names as the inputs, so
# running this cell a second time feeds the already-processed logs back in --
# re-run the simulation cell first if this step has to be repeated.
unchanged_eventlog, changed_eventlog = (
    apply_scenario(eventlog, scenario, activity_id)
    for eventlog, scenario in (
        (unchanged_eventlog, unchanged_scenario),
        (changed_eventlog, changed_scenario),
    )
)
print(unchanged_eventlog, changed_eventlog, sep='\n')

                          concept:name      time:timestamp case:concept:name  \
0             Check stock availability 1970-04-26 19:46:40             C0000   
1      Retrieve product from warehouse 1970-04-26 19:46:41             C0000   
2                        Confirm order 1970-04-26 19:46:42             C0000   
3                         Emit invoice 1970-04-26 19:46:43             C0000   
4                 Get shipping address 1970-04-26 19:46:44             C0000   
...                                ...                 ...               ...   
10555             Get shipping address 1970-04-26 22:42:35             C0999   
10556                     Emit invoice 1970-04-26 22:42:36             C0999   
10557                     Ship product 1970-04-26 22:42:37             C0999   
10558                  Receive Payment 1970-04-26 22:42:38             C0999   
10559                   Archieve order 1970-04-26 22:42:39             C0999   

           time      cost  
0      0.01

To get a view that is more convenient for machine learning, we can now convert the event logs into case tables.

In [8]:
from source.operation import to_case_table

def _as_case_table(eventlog):
    """Convert one event log into a case table with the settings used here.

    Missing values are filled with 0 and the 'cost' and 'time' columns are
    aggregated with 'sum'. A fresh `aggregate` dict is built on every call.
    """
    return to_case_table(
        eventlog,
        case_id,
        activity_id,
        fillna=0,
        aggregate={'cost':'sum','time':'sum'},
    )


unchanged_case_table = _as_case_table(unchanged_eventlog)
changed_case_table = _as_case_table(changed_eventlog)

print(unchanged_case_table, changed_case_table, sep='\n')

                   cost Archieve order  cost Check raw materials availabilty  \
case:concept:name                                                              
C0000                         1.833333                              0.000000   
C0001                         1.833333                              0.000000   
C0002                         1.833333                              1.833333   
C0003                         1.833333                              1.833333   
C0004                         1.833333                              0.000000   
...                                ...                                   ...   
C0995                         1.833333                              0.000000   
C0996                         1.833333                              0.000000   
C0997                         1.833333                              1.833333   
C0998                         1.833333                              0.000000   
C0999                         1.833333  

Finally, we can apply the defined rules and calculate the result.

In [9]:
from source.operation import calculate_outcome

# One ruleset per KPI: 'time' uses the basic ruleset of the respective process
# variant, 'cost' is given no ruleset (None).
unchanged_ruleset = {
    'time': unchanged_basic_ruleset,
    'cost': None,
}
changed_ruleset = {
    'time': changed_basic_ruleset,
    'cost': None,
}

# NOTE(review): the results are bound back to the input names, so a second run
# of this cell starts from the already-processed tables -- rebuild the case
# tables first if this step has to be repeated.
unchanged_case_table = calculate_outcome(unchanged_case_table, unchanged_ruleset)
changed_case_table = calculate_outcome(changed_case_table, changed_ruleset)

print(unchanged_case_table, changed_case_table, sep='\n')

# Persist both case tables; the row index is not written to the files.
unchanged_case_table.to_csv(
    CASE_TABLE_DIR_PATH/'unchanged.csv',
    index=False,
)
changed_case_table.to_csv(
    CASE_TABLE_DIR_PATH/'changed.csv',
    index=False,
)

    case:concept:name  cost Archieve order  \
0               C0000             1.833333   
1               C0001             1.833333   
2               C0002             1.833333   
3               C0003             1.833333   
4               C0004             1.833333   
..                ...                  ...   
995             C0995             1.833333   
996             C0996             1.833333   
997             C0997             1.833333   
998             C0998             1.833333   
999             C0999             1.833333   

     cost Check raw materials availabilty  cost Check stock availability  \
0                                0.000000                       1.833333   
1                                0.000000                       1.833333   
2                                1.833333                       1.833333   
3                                1.833333                       1.833333   
4                                0.000000                       1.8

## Causality
Now we have the data we need. So it is time to explain the idea behind it. For this we use the so-called "Double Machine Learning". The basic idea is that the prediction can be used as a guide for causality testing. In our case, we will try to compare the two different processes under the same conditions and finally explain the difference in the KPI using the changes in the process. The background is explained below.
### Variables
| Variable | Description |
| --- | --- |
| t | time |
| c | costs |
| x | generic features |
| n | particular change |
| d<sub>n</sub> | feature of a change |
| p | &sum;(n) representing the process &rarr; p &isin; {0; 1} |
### Assumption
The first assumptions concern the KPIs, which in this case are the cost `c` and the time `t`. We assume that each result is calculated by a function `f` which takes the generic features `x` as input. In addition, the result changes because of the changes `p` made to the process. This is modelled by adding a function `g` which takes `p` as input.<br>
t(x) = f<sub>t</sub>(x) + g<sub>t</sub>(p)<br>
c(x) = f<sub>c</sub>(x) + g<sub>c</sub>(p)<br>
Taking `c` as the example, the next assumption is that the result does not change if the process is not changed.<br>
g(0) = 0<br>
On the other hand, the function `g` is the sum of the contributions of all individual changes made to the process.<br>
g(p) = &sum;(d<sub>n</sub>)
### Procedure
In order to prove causality, the actual results of the unchanged process are first expressed as the predictions of a model `m`.<br>
c<sub>p=0</sub> = m<sub>p=0</sub>(x)<br>
The next step is to determine the difference &Delta; between the prediction of the model and the results of the changed process c<sub>p=1</sub>. This difference represents the change in the KPI that resulted from the change in the process.<br>
&Delta;<sub>c</sub> = c<sub>p=0</sub> - c<sub>p=1</sub><br>
Finally, another model `M` is used to try to determine the change in the KPI from the changes `g(1)`. The better this succeeds, i.e. the higher the accuracy, the more one can speak of a causal relationship.<br>
causality &equiv; accuracy(M<sub>c</sub>(g(1))&rarr;&Delta;<sub>c</sub>)<br>
In addition, under the following assumption, each individual change can also be checked.<br>
causality &equiv; &sum;<sub>n</sub> accuracy(M<sub>c<sub>n</sub></sub>(d<sub>n</sub>)&rarr;&Delta;<sub>c<sub>n</sub></sub>)

Sources:<br>
https://link.springer.com/article/10.1365/s40702-019-00557-y <br>
https://ichi.pro/de/1-1-bessere-entscheidungsfindung-wenn-kausale-folgerung-auf-maschinelles-lernen-trifft-208061114886251

## Machine Learning
However, implementing our idea still requires some preparation. Since machine learning is involved in the end, it is necessary to take a closer look at the data and, if necessary, to process it further.
### Preprocessing

In [10]:
# Summary statistics of both case tables, unchanged process first.
print(
    unchanged_case_table.describe(),
    changed_case_table.describe(),
    sep='\n',
)

       cost Archieve order  cost Check raw materials availabilty  \
count         1.000000e+03                           1000.000000   
mean          1.833333e+00                              0.938667   
std           4.443114e-16                              0.916861   
min           1.833333e+00                              0.000000   
25%           1.833333e+00                              0.000000   
50%           1.833333e+00                              1.833333   
75%           1.833333e+00                              1.833333   
max           1.833333e+00                              1.833333   

       cost Check stock availability  cost Confirm order  cost Emit invoice  \
count                   1.000000e+03        1.000000e+03       1.000000e+03   
mean                    1.833333e+00        1.833333e+00       1.833333e+00   
std                     4.443114e-16        4.443114e-16       4.443114e-16   
min                     1.833333e+00        1.833333e+00       1.833333

As can be seen, there are features on both sides which have no standard deviation or, due to the numerical representation, a standard deviation close to zero. In addition, it is known that some features can carry the same information because of the way they are represented. This happens when a process step is always performed the same number of times with the same time and cost (e.g. the automatic sending of invoices). Therefore, it must be checked whether there are features that carry identical information on an aligned scale.

In [11]:
from source.features import prepare_features
# Prepare the features of both case tables in one joint call; the function
# returns the prepared unchanged table first and the prepared changed table
# second.
prepared_unchanged_case_table, prepared_changed_case_table = prepare_features(
    unchanged_case_table,
    changed_case_table,
)
print(
    prepared_unchanged_case_table.describe(),
    prepared_changed_case_table.describe(),
    sep='\n',
)

cost Archieve order                              0.000
cost Check raw materials availabilty             0.841
cost Check stock availability                    0.000
cost Confirm order                               0.000
cost Emit invoice                                0.000
cost Get shipping address                        0.000
cost Manufacture product                         0.841
cost Obtain raw materials from Supplier 1        0.840
cost Obtain raw materials from Supplier 2        0.847
cost Receive Payment                             0.000
cost Request raw materials from Supplier 1       0.839
cost Request raw materials from Supplier 2       0.847
cost Retrieve product from warehouse             0.844
cost Ship product                                0.007
time Archieve order                              0.000
time Check raw materials availabilty             0.000
time Check stock availability                    0.000
time Confirm order                               0.000
time Emit 

### Validation
The next step is to divide the features into generic and changed features. The generic characteristics describe the information that is absolutely necessary to represent the process as a model. The changed characteristics are accordingly necessary for the representation of the change.

In [12]:
# Generic features: every column of the prepared unchanged case table except
# the case identifier and the two KPI columns. Dropping on the column index
# yields the same labels without building a reduced copy of the table.
generic_features = (
    prepared_unchanged_case_table.columns
    .drop(['case:concept:name', 'time', 'cost'])
    .to_list()
)
for feature in generic_features:
    print(feature)

cost Manufacture product
cost Obtain raw materials from Supplier 1
cost Obtain raw materials from Supplier 2
cost Request raw materials from Supplier 1
cost Request raw materials from Supplier 2
cost Retrieve product from warehouse
cost Ship product
Num of Check raw materials availabilty
Num of Manufacture product
Num of Obtain raw materials from Supplier 1
Num of Obtain raw materials from Supplier 2
Num of Request raw materials from Supplier 1
Num of Request raw materials from Supplier 2
Num of Retrieve product from warehouse


In [13]:
# Changed features: what remains of the prepared changed case table's columns
# once the case identifier, the two KPI columns and all generic features are
# removed. A label that is missing from the table still raises a KeyError.
changed_features = (
    prepared_changed_case_table.columns
    .drop(['case:concept:name', 'time', 'cost'] + generic_features)
    .to_list()
)
for feature in changed_features:
    print(feature)

cost Emit invoice
cost Get shipping address
cost Receive Payment
cost Start request raw materials from Supplier 1
cost Start request raw materials from Supplier 2
Num of Archieve order
Num of Confirm order
Num of Emit invoice
Num of Get shipping address
Num of Notify unavailability to customer
Num of Receive Payment
Num of Ship product
Num of Start request raw materials from Supplier 1
Num of Start request raw materials from Supplier 2


### Causality Checking
Now we have made all the preparations to start the actual causality check. For this we use the generic and changed features, as well as the associated data. The only thing left to do is to choose a model. In this case, we have chosen the simplest type of regression, linear regression, for which we can use the scikit-learn implementation. In addition, all scikit-learn-compliant estimators are supported.

In [None]:
from source.causality import full_check
from sklearn.linear_model import LinearRegression
# Run the causality check once per KPI column ('time', then 'cost'), each with
# its own new LinearRegression instance. Both calls receive the same prepared
# case tables and the same generic/changed feature lists.
causal_accuracy_time = full_check(
    LinearRegression(),
    prepared_unchanged_case_table,
    prepared_changed_case_table,
    'time',
    generic_features,
    changed_features,
)
# The printed labels previously read "Accurarcy"; the spelling is corrected.
print('Accuracy time:', causal_accuracy_time)

causal_accuracy_cost = full_check(
    LinearRegression(),
    prepared_unchanged_case_table,
    prepared_changed_case_table,
    'cost',
    generic_features,
    changed_features,
)
print('Accuracy cost:', causal_accuracy_cost)