In [2]:
logName = "CreditRequirement"

In [3]:
import pandas as pd
import numpy as np
data = pd.read_csv("../logdata/%s.csv" % logName, sep=",")
case_id_col = 'case_id'
timestamp_col = "Complete Timestamp"

In [4]:
data.head()

Unnamed: 0,case_id,Activity,activity_duration,Complete Timestamp,timesincemidnight,month,weekday,hour,timesincecasestart,open_cases,remtime
0,1,Requirements_review,12.083333,2014-04-02 14:29:46,869,4,2,14,388.966667,67,0.0
1,1,Credit_committee,42.1,2014-04-02 14:17:41,857,4,2,14,376.883333,64,12.083333
2,1,Collateral_check,55.483333,2014-04-02 13:35:35,815,4,2,13,334.783333,61,54.183333
3,1,Credit_worthiness_check,95.033333,2014-04-02 12:40:06,760,4,2,12,279.3,49,109.666667
4,1,Completeness_check,77.266667,2014-04-02 11:05:04,665,4,2,11,184.266667,39,204.7


In [5]:
data[timestamp_col] = pd.to_datetime(data[timestamp_col])

In [6]:
def split_data(data, train_ratio):
    # split into train and test using temporal split

    grouped = data.groupby(case_id_col)
    start_timestamps = grouped[timestamp_col].min().reset_index()
    start_timestamps = start_timestamps.sort_values(timestamp_col, ascending=True, kind='mergesort')
    train_ids = list(start_timestamps[case_id_col])[:int(np.ceil(train_ratio*len(start_timestamps)))]
    train = data[data[case_id_col].isin(train_ids)].sort_values(timestamp_col, ascending=True, kind='mergesort')
    test = data[~data[case_id_col].isin(train_ids)].sort_values(timestamp_col, ascending=True, kind='mergesort')

    return (train, test)

In [7]:
_, test = split_data(data, train_ratio=0.660)

In [8]:
test.shape

(27288, 11)

In [9]:
min_prefix_length = 2
max_prefix_length = 7

In [10]:
test[test[case_id_col]==6625]

Unnamed: 0,case_id,Activity,activity_duration,Complete Timestamp,timesincemidnight,month,weekday,hour,timesincecasestart,open_cases,remtime
52999,6625,Register,0.0,2014-07-10 12:56:07,776,7,3,12,0.0,76,1349.716667
52998,6625,Acceptance_of_requests,16.366667,2014-07-10 13:12:29,792,7,3,13,16.366667,73,1333.35
52997,6625,Collection_of_documents,89.883333,2014-07-10 14:42:22,882,7,3,14,106.25,69,1243.466667
52996,6625,Completeness_check,60.9,2014-07-10 15:43:16,943,7,3,15,167.15,64,1182.566667
52995,6625,Credit_worthiness_check,1063.616667,2014-07-11 09:26:53,566,7,4,9,1230.766667,70,118.95
52994,6625,Collateral_check,66.983333,2014-07-11 10:33:52,633,7,4,10,1297.75,77,51.966667
52993,6625,Credit_committee,45.983333,2014-07-11 11:19:51,679,7,4,11,1343.733333,72,5.983333
52992,6625,Requirements_review,5.983333,2014-07-11 11:25:50,685,7,4,11,1349.716667,70,0.0


In [11]:
def add_event_nr_case_length(group):
    
    group = group.sort_values(timestamp_col, ascending=True, kind='mergesort')
    group["event_nr"] = range(1, len(group) + 1)
    group['case_length'] = group.shape[0]
    
    return group


In [12]:
test = test.groupby(case_id_col).apply(add_event_nr_case_length)

In [13]:
test[test[case_id_col]==6625]

Unnamed: 0_level_0,Unnamed: 1_level_0,case_id,Activity,activity_duration,Complete Timestamp,timesincemidnight,month,weekday,hour,timesincecasestart,open_cases,remtime,event_nr,case_length
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6625,52999,6625,Register,0.0,2014-07-10 12:56:07,776,7,3,12,0.0,76,1349.716667,1,8
6625,52998,6625,Acceptance_of_requests,16.366667,2014-07-10 13:12:29,792,7,3,13,16.366667,73,1333.35,2,8
6625,52997,6625,Collection_of_documents,89.883333,2014-07-10 14:42:22,882,7,3,14,106.25,69,1243.466667,3,8
6625,52996,6625,Completeness_check,60.9,2014-07-10 15:43:16,943,7,3,15,167.15,64,1182.566667,4,8
6625,52995,6625,Credit_worthiness_check,1063.616667,2014-07-11 09:26:53,566,7,4,9,1230.766667,70,118.95,5,8
6625,52994,6625,Collateral_check,66.983333,2014-07-11 10:33:52,633,7,4,10,1297.75,77,51.966667,6,8
6625,52993,6625,Credit_committee,45.983333,2014-07-11 11:19:51,679,7,4,11,1343.733333,72,5.983333,7,8
6625,52992,6625,Requirements_review,5.983333,2014-07-11 11:25:50,685,7,4,11,1349.716667,70,0.0,8,8


In [14]:
for prefix in range(min_prefix_length, max_prefix_length+1):
    print(prefix)
    long_cases = test[test["case_length"] > prefix]
    longer_cases_this_length = long_cases[long_cases["event_nr"] <= prefix]
    longer_cases_this_length.to_csv("../formulas/CreditRequirement/test_len_%s.csv" % prefix, index=False)

2
3
4
5
6
7
