In [1]:
logName = "BPI2012A"

In [2]:
import pandas as pd
import numpy as np
data = pd.read_csv("../logdata/%s.csv" % logName, sep=";")
case_id_col = 'case_id'
timestamp_col = "time"

In [3]:
data.shape

(53952, 8)

In [4]:
data[timestamp_col] = pd.to_datetime(data[timestamp_col])

In [5]:
def split_data(data, train_ratio):
    # split into train and test using temporal split

    grouped = data.groupby(case_id_col)
    start_timestamps = grouped[timestamp_col].min().reset_index()
    start_timestamps = start_timestamps.sort_values(timestamp_col, ascending=True, kind='mergesort')
    train_ids = list(start_timestamps[case_id_col])[:int(np.ceil(train_ratio*len(start_timestamps)))]
    train = data[data[case_id_col].isin(train_ids)].sort_values(timestamp_col, ascending=True, kind='mergesort')
    test = data[~data[case_id_col].isin(train_ids)].sort_values(timestamp_col, ascending=True, kind='mergesort')

    return (train, test)

In [6]:
_, test = split_data(data, train_ratio=0.660)

In [7]:
test.shape

(17579, 8)

In [8]:
min_prefix_length = 2
max_prefix_length = 7

In [10]:
test[test[case_id_col]==6625]

Unnamed: 0,case_id,Activity,activity_duration,Complete Timestamp,timesincemidnight,month,weekday,hour,timesincecasestart,open_cases,remtime
52999,6625,Register,0.0,2014-07-10 12:56:07,776,7,3,12,0.0,76,1349.716667
52998,6625,Acceptance_of_requests,16.366667,2014-07-10 13:12:29,792,7,3,13,16.366667,73,1333.35
52997,6625,Collection_of_documents,89.883333,2014-07-10 14:42:22,882,7,3,14,106.25,69,1243.466667
52996,6625,Completeness_check,60.9,2014-07-10 15:43:16,943,7,3,15,167.15,64,1182.566667
52995,6625,Credit_worthiness_check,1063.616667,2014-07-11 09:26:53,566,7,4,9,1230.766667,70,118.95
52994,6625,Collateral_check,66.983333,2014-07-11 10:33:52,633,7,4,10,1297.75,77,51.966667
52993,6625,Credit_committee,45.983333,2014-07-11 11:19:51,679,7,4,11,1343.733333,72,5.983333
52992,6625,Requirements_review,5.983333,2014-07-11 11:25:50,685,7,4,11,1349.716667,70,0.0


In [9]:
def add_event_nr_case_length(group):
    
    group = group.sort_values(timestamp_col, ascending=True, kind='mergesort')
    group["event_nr"] = range(1, len(group) + 1)
    group['case_length'] = group.shape[0]
    
    return group

In [10]:
test = test.groupby(case_id_col).apply(add_event_nr_case_length)

In [12]:
test.head(40)

Unnamed: 0_level_0,Unnamed: 1_level_0,case_id,activity_name,Resource,AMOUNT_REQ,elapsed,time,remtime,open_cases,event_nr,case_length
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
199231,36373,199231,A_SUBMITTED,112,25000,0.0,2012-01-09 11:53:59,5131.0,756,1,3
199231,36374,199231,A_PARTLYSUBMITTED,112,25000,0.0,2012-01-09 11:53:59,5131.0,756,2,3
199231,36375,199231,A_DECLINED,11181,25000,5131.0,2012-01-09 13:19:30,0.0,746,3,3
199234,36376,199234,A_SUBMITTED,112,600,0.0,2012-01-09 12:10:12,38.0,755,1,3
199234,36377,199234,A_PARTLYSUBMITTED,112,600,0.0,2012-01-09 12:10:12,38.0,755,2,3
199234,36378,199234,A_DECLINED,112,600,38.0,2012-01-09 12:10:50,0.0,754,3,3
199237,36379,199237,A_SUBMITTED,112,600,0.0,2012-01-09 12:15:50,35.0,754,1,3
199237,36380,199237,A_PARTLYSUBMITTED,112,600,0.0,2012-01-09 12:15:50,35.0,754,2,3
199237,36381,199237,A_DECLINED,112,600,35.0,2012-01-09 12:16:25,0.0,753,3,3
199240,36382,199240,A_SUBMITTED,112,600,0.0,2012-01-09 12:17:04,38.0,754,1,3


In [13]:
for prefix in range(min_prefix_length, max_prefix_length+1):
    print(prefix)
    long_cases = test[test["case_length"] > prefix]
    longer_cases_this_length = long_cases[long_cases["event_nr"] <= prefix]
    longer_cases_this_length.to_csv("../formulas/%s/test_len_%s.csv" % (logName, prefix), index=False)

2
3
4
5
6
7


In [14]:
test.groupby("case_length")[case_id_col].nunique()

case_length
3    2053
4     702
5     278
6     585
8     464
Name: case_id, dtype: int64