# Process Causality Project
## Environment Setup

In [1]:
import pandas as pd
import pm4py as pm
from path import path

# Working folders, all resolved against the imported base `path`
# (presumably the project's data root -- confirm in the `path` module).
xes_directory, csv_directory, case_tables_directory = (
    path / folder for folder in ('xes_logs', 'csv_logs', 'case_tables')
)

# The two compressed XES event logs that get compared in this notebook.
unchanged_process_xes = xes_directory / 'financial_log.xes.gz'
changed_process_xes = xes_directory / 'BPI Challenge 2017.xes.gz'

## Loading
First, we load both event logs. Each loaded log is then converted to a pandas DataFrame and saved as a CSV file.

In [2]:
# Parse both XES files; pm4py expects plain string paths here.
unchanged_log = pm.read_xes(str(unchanged_process_xes))
changed_log = pm.read_xes(str(changed_process_xes))

# Turn each parsed log into a pandas DataFrame.
unchanged_frame = pm.convert_to_dataframe(unchanged_log)
changed_frame = pm.convert_to_dataframe(changed_log)

# Keep a CSV copy of every frame, named after the source file's stem.
unchanged_frame.to_csv(csv_directory / f'{unchanged_process_xes.stem}.csv')
changed_frame.to_csv(csv_directory / f'{changed_process_xes.stem}.csv')

parsing log, completed traces :: 100%|██████████| 13087/13087 [00:05<00:00, 2237.70it/s]
parsing log, completed traces :: 100%|██████████| 31509/31509 [00:35<00:00, 880.82it/s]


## Case Tables
Next, we create a case table in which the information of each case is mapped to a KPI. As an example KPI, we use the total elapsed time from the start of the process to its end.

In [5]:

# Column names used throughout: the case identifier and the activity label.
case_id, activity_names = 'case:concept:name', 'concept:name'

# Unchanged process: distinct case ids (plain list) and distinct activities (Series).
unchanged_cases = unchanged_frame[case_id].unique().tolist()
unchanged_activities = unchanged_frame[activity_names].drop_duplicates()

# Changed process: same two collections.
changed_cases = changed_frame[case_id].unique().tolist()
changed_activities = changed_frame[activity_names].drop_duplicates()

def to_case_table(cases, activities, frame):
    """Build a table with one row per case: activity counts plus case duration.

    Relies on the module-level column names ``case_id`` and ``activity_names``
    and on the ``'time:timestamp'`` column of ``frame``.

    Args:
        cases: Case identifiers to include; output rows follow this order.
        activities: pandas Series of activity names; each one becomes a count
            column, in this order.
        frame: Event log as a DataFrame with one row per event.

    Returns:
        A DataFrame with the columns ``'case_id'``, one integer column per
        activity (number of events of that activity in the case, 0 if it
        never occurs) and ``'time'`` (hours between the first and the last
        event of the case). Cases without any event in ``frame`` get zero
        counts and a NaN time.

    Raises:
        ValueError: If one of the activities is literally named ``'case_id'``.
    """
    activity_columns = activities.to_list()
    case_order = list(cases)

    # Count events per (case, activity) in a single grouped pass instead of
    # re-filtering the whole event log once per case, then align the result
    # to the requested cases/activities; absent combinations become 0.
    counts = (
        frame.groupby([case_id, activity_names])
        .size()
        .unstack(fill_value=0)
        .reindex(index=case_order, columns=activity_columns, fill_value=0)
    )

    # Case duration in hours: last timestamp minus first timestamp.
    timestamps = frame.groupby(case_id)['time:timestamp']
    elapsed = pd.to_timedelta(timestamps.max() - timestamps.min())
    hours = elapsed.dt.total_seconds() / 60 / 60

    case_table = counts.reset_index(drop=True)
    case_table.columns = activity_columns  # plain labels, no axis name
    case_table.insert(0, 'case_id', case_order)
    case_table['time'] = hours.reindex(case_order).to_numpy()
    return case_table

# Build the case table of each log and store it as CSV.
# to_case_table already returns a DataFrame, so it is used directly.
# NOTE(review): the file name is stem + 'case_table.csv' with no separator in
# between -- confirm whether downstream readers rely on that exact name.
unchanged_case_table = to_case_table(unchanged_cases, unchanged_activities, unchanged_frame)
unchanged_case_table.to_csv(case_tables_directory/(unchanged_process_xes.stem+'case_table.csv'))

changed_case_table = to_case_table(changed_cases, changed_activities, changed_frame)
changed_case_table.to_csv(case_tables_directory/(changed_process_xes.stem+'case_table.csv'))