In [1]:
logName = "hospital_billing_977"

In [2]:
import pandas as pd
import numpy as np
data = pd.read_csv("../logdata/%s.csv" % logName, sep=";")
case_id_col = 'case_id'
timestamp_col = "Complete Timestamp"

In [8]:
data.shape

(330945, 28)

In [4]:
data[timestamp_col] = pd.to_datetime(data[timestamp_col])

In [5]:
def split_data(data, train_ratio):
    # split into train and test using temporal split

    grouped = data.groupby(case_id_col)
    start_timestamps = grouped[timestamp_col].min().reset_index()
    start_timestamps = start_timestamps.sort_values(timestamp_col, ascending=True, kind='mergesort')
    train_ids = list(start_timestamps[case_id_col])[:int(np.ceil(train_ratio*len(start_timestamps)))]
    train = data[data[case_id_col].isin(train_ids)].sort_values(timestamp_col, ascending=True, kind='mergesort')
    test = data[~data[case_id_col].isin(train_ids)].sort_values(timestamp_col, ascending=True, kind='mergesort')

    return (train, test)

In [6]:
_, test = split_data(data, train_ratio=0.660)

In [7]:
test.shape

(112223, 28)

In [22]:
min_prefix_length = 2
max_prefix_length = 6

In [10]:
test[test[case_id_col]==6625]

Unnamed: 0,case_id,Activity,activity_duration,Complete Timestamp,timesincemidnight,month,weekday,hour,timesincecasestart,open_cases,remtime
52999,6625,Register,0.0,2014-07-10 12:56:07,776,7,3,12,0.0,76,1349.716667
52998,6625,Acceptance_of_requests,16.366667,2014-07-10 13:12:29,792,7,3,13,16.366667,73,1333.35
52997,6625,Collection_of_documents,89.883333,2014-07-10 14:42:22,882,7,3,14,106.25,69,1243.466667
52996,6625,Completeness_check,60.9,2014-07-10 15:43:16,943,7,3,15,167.15,64,1182.566667
52995,6625,Credit_worthiness_check,1063.616667,2014-07-11 09:26:53,566,7,4,9,1230.766667,70,118.95
52994,6625,Collateral_check,66.983333,2014-07-11 10:33:52,633,7,4,10,1297.75,77,51.966667
52993,6625,Credit_committee,45.983333,2014-07-11 11:19:51,679,7,4,11,1343.733333,72,5.983333
52992,6625,Requirements_review,5.983333,2014-07-11 11:25:50,685,7,4,11,1349.716667,70,0.0


In [10]:
def add_event_nr_case_length(group):
    
    group = group.sort_values(timestamp_col, ascending=True, kind='mergesort')
    group["event_nr"] = range(1, len(group) + 1)
    group['case_length'] = group.shape[0]
    
    return group

In [11]:
test = test.groupby(case_id_col).apply(add_event_nr_case_length)

In [13]:
test[test[case_id_col]=="AABD"]

Unnamed: 0_level_0,Unnamed: 1_level_0,speciality,case_id,Activity,Resource,actOrange,actRed,blocked,caseType,diagnosis,flagC,...,Complete Timestamp,month,weekday,hour,timesincelastevent,timesincecasestart,event_nr,open_cases,remtime,case_length
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AABD,45,H,AABD,NEW,ResSD,missing,missing,False,B,missing,missing,...,2015-01-21 18:38:28,1,2,18,0.0,0.0,1,12690,18063910.0,6
AABD,44,H,AABD,CHANGE_DIAGN,ResYAA,missing,missing,False,B,TNA,missing,...,2015-01-23 13:38:54,1,4,13,2580.433333,2580.433333,2,12657,17909084.0,6
AABD,43,H,AABD,FIN,ResA,missing,missing,False,B,TNA,missing,...,2015-04-24 02:04:40,4,4,2,130345.766667,132926.2,3,12434,10088338.0,6
AABD,42,H,AABD,RELEASE,ResA,missing,missing,False,B,TNA,missing,...,2015-04-24 04:11:21,4,4,4,126.683333,133052.883333,4,12437,10080737.0,6
AABD,41,H,AABD,CODE_OK,ResA,False,False,False,B,TNA,False,...,2015-04-27 04:13:48,4,0,4,4322.45,137375.333333,5,12514,9821390.0,6
AABD,40,H,AABD,BILLED,ResB,False,False,False,B,TNA,False,...,2015-08-18 20:23:38,8,1,20,163689.833333,301065.166667,6,14312,0.0,6


In [15]:
for prefix in range(min_prefix_length, max_prefix_length+1):
    print(prefix)
    long_cases = test[test["case_length"] > prefix]
    longer_cases_this_length = long_cases[long_cases["event_nr"] <= prefix]
    longer_cases_this_length.to_csv("../formulas/%s/test_len_%s.csv" % (logName, prefix), index=False)

2
3
4
5
6
7


In [21]:
test.groupby("case_length")[case_id_col].nunique()

case_length
3      654
5     7345
6    11430
7      708
Name: case_id, dtype: int64