In [1]:
logName = "minit_invoice_10"

In [2]:
import pandas as pd
import numpy as np
data = pd.read_csv("../logdata/%s.csv" % logName, sep=";")
case_id_col = 'Case ID'
timestamp_col = "Complete Timestamp"

In [3]:
data.shape

(62740, 19)

In [4]:
data[timestamp_col] = pd.to_datetime(data[timestamp_col])

In [5]:
def split_data(data, train_ratio):
    # split into train and test using temporal split

    grouped = data.groupby(case_id_col)
    start_timestamps = grouped[timestamp_col].min().reset_index()
    start_timestamps = start_timestamps.sort_values(timestamp_col, ascending=True, kind='mergesort')
    train_ids = list(start_timestamps[case_id_col])[:int(np.ceil(train_ratio*len(start_timestamps)))]
    train = data[data[case_id_col].isin(train_ids)].sort_values(timestamp_col, ascending=True, kind='mergesort')
    test = data[~data[case_id_col].isin(train_ids)].sort_values(timestamp_col, ascending=True, kind='mergesort')

    return (train, test)

In [6]:
_, test = split_data(data, train_ratio=0.60)

In [7]:
test.shape

(24896, 19)

In [8]:
min_prefix_length = 2
max_prefix_length = 20

In [10]:
test.head()

Unnamed: 0,CostCenter.Code,Supplier.City,Supplier.Name,Supplier.State,InvoiceTotalAmountWithoutVAT,Case ID,Activity,Resource,ActivityFinalAction,EventType,Complete Timestamp,month,weekday,hour,timesincelastevent,timesincecasestart,remtime,event_nr,open_cases
36457,CostCenter1,Oevel,Ut Consulting,Bulgaria,2093,983425,Start,Server,missing,0,2015-03-17 11:48:16,3,1,11,0.0,0.0,80591.0,1,380
36458,CostCenter1,Oevel,Ut Consulting,Bulgaria,2093,983425,Process_start,Server,missing,1,2015-03-17 11:48:16,3,1,11,0.0,0.0,80591.0,2,381
36459,CostCenter1,Oevel,Ut Consulting,Bulgaria,2093,983425,Status_change_to_Being_Approved,Server,missing,1,2015-03-17 11:48:17,3,1,11,1.0,1.0,80590.0,3,381
36460,CostCenter1,Oevel,Ut Consulting,Bulgaria,2093,983425,Check_order_numbers,Server,missing,1,2015-03-17 11:48:18,3,1,11,0.0,2.0,80589.0,4,381
36461,CostCenter1,Oevel,Ut Consulting,Bulgaria,2093,983425,Compare_of_sums,Server,missing,1,2015-03-17 11:48:18,3,1,11,0.0,2.0,80589.0,5,381


In [11]:
def add_event_nr_case_length(group):
    
    group = group.sort_values(timestamp_col, ascending=True, kind='mergesort')
    group["event_nr"] = range(1, len(group) + 1)
    group['case_length'] = group.shape[0]
    
    return group

In [12]:
test = test.groupby(case_id_col).apply(add_event_nr_case_length)

In [14]:
test.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,CostCenter.Code,Supplier.City,Supplier.Name,Supplier.State,InvoiceTotalAmountWithoutVAT,Case ID,Activity,Resource,ActivityFinalAction,EventType,Complete Timestamp,month,weekday,hour,timesincelastevent,timesincecasestart,remtime,event_nr,open_cases,case_length
Case ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
988616,62682,CostCenter3,Pozzuolo del Friuli,Pede Industries,Denmark,6699,988616,Shift_to_higher_level,Server,Approved,1,2015-04-07 16:56:19,4,1,16,1.0,170.0,475.0,11,20,15
988616,62683,CostCenter3,Pozzuolo del Friuli,Pede Industries,Denmark,6699,988616,Approving_on_specific_level,Lara Obrien,Approved,2,2015-04-07 16:56:30,4,1,16,11.0,181.0,464.0,12,20,15
988616,62696,CostCenter3,Pozzuolo del Friuli,Pede Industries,Denmark,6699,988616,Check_whether_the_total_approval,Server,Approved,1,2015-04-07 17:04:13,4,1,17,0.0,644.0,1.0,13,17,15
988616,62697,CostCenter3,Pozzuolo del Friuli,Pede Industries,Denmark,6699,988616,Status_change_to_Accounted,Server,Approved,1,2015-04-07 17:04:13,4,1,17,463.0,644.0,1.0,14,17,15
988616,62698,CostCenter3,Pozzuolo del Friuli,Pede Industries,Denmark,6699,988616,Process_end,Server,Approved,1,2015-04-07 17:04:14,4,1,17,1.0,645.0,0.0,15,17,15


In [17]:
for prefix in range(min_prefix_length, max_prefix_length+1):
    print(prefix)
    long_cases = test[test["case_length"] > prefix]
    longer_cases_this_length = long_cases[long_cases["event_nr"] <= prefix]
    longer_cases_this_length.to_csv("../formulas/%s/test_len_%s.csv" % (logName, prefix), index=False)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [16]:
test.groupby("case_length")[case_id_col].nunique()

case_length
10      41
11    1008
12     664
15     234
18      74
21      28
Name: Case ID, dtype: int64