In [1]:
import pandas as pd
import numpy as np
import os
import sys
from collections import defaultdict, Counter

# The raw log is read from input_data_folder; exported CSVs are written to output_data_folder.
input_data_folder = "../logdata/orig/"
output_data_folder = "../logdata/"
# File name of the raw log; it is also reused as the name of the filtered export at the end.
in_filename = "hospital_billing.csv"

In [2]:
# Names of the key columns of the event log.
case_id_col = "case_id"  # groupby key: which case an event row belongs to
activity_col = "Activity"  # activity label, checked when looking for completed cases
timestamp_col = "Complete Timestamp"  # parsed with pd.to_datetime and used to order events

In [None]:
# Categorical levels that occur fewer than this many times are replaced by "other".
category_freq_threshold = 10

In [None]:
# features for classifier
# NOTE(review): "dynamic" presumably marks attributes that can change from event to
# event and "static" ones that are fixed per case — confirm against the dataset.
# Categorical columns later have missing values filled with 'missing' and infrequent
# levels replaced by "other".
dynamic_cat_cols = ["Activity", 'Resource', 'actOrange', 'actRed', 'blocked', 'caseType', 'diagnosis', 'flagC', 'flagD', 'msgCode', 'msgType', 'state', 'version', 'isCancelled', 'isClosed', 'closeCode'] 
static_cat_cols = ['speciality']
dynamic_num_cols = ['msgCount']
static_num_cols = []

In [None]:
# static_cols + dynamic_cols are the columns kept in the frame and forward-filled
# per case; cat_cols are all categorical features.
static_cols = static_cat_cols + static_num_cols + [case_id_col]
dynamic_cols = dynamic_cat_cols + dynamic_num_cols + [timestamp_col]
cat_cols = dynamic_cat_cols + static_cat_cols

In [None]:
def extract_timestamp_features(group):
    """Add elapsed-time and position features to the events of one case.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single case. The column named by ``timestamp_col`` must
        hold datetime values.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` sorted by ascending timestamp with three extra
        columns: ``timesincelastevent`` (minutes since the preceding event,
        0 for the first one), ``timesincecasestart`` (minutes since the
        earliest event) and ``event_nr`` (1-based position in the case).
    """
    zero = pd.Timedelta(0)
    one_minute = np.timedelta64(1, 'm')  # m is for minutes

    # Newest event first, so shift(-1) lines every event up with its predecessor.
    group = group.sort_values(timestamp_col, ascending=False, kind='mergesort')
    timestamps = group[timestamp_col]

    # Missing differences are filled with a zero Timedelta: filling a
    # timedelta column with the integer 0 is not supported by newer pandas.
    since_previous = (timestamps - timestamps.shift(-1)).fillna(zero)
    group["timesincelastevent"] = (since_previous / one_minute).astype(float)

    # After the descending sort the last row is the earliest event of the case.
    since_start = (timestamps - timestamps.iloc[-1]).fillna(zero)
    group["timesincecasestart"] = (since_start / one_minute).astype(float)

    # Back to chronological order before numbering the events.
    group = group.sort_values(timestamp_col, ascending=True, kind='mergesort')
    group["event_nr"] = range(1, len(group) + 1)

    return group

def check_if_any_of_activities_exist(group, activities):
    """Return True if at least one event in `group` has an activity listed in `activities`.

    `group` is a DataFrame holding the column named by `activity_col`;
    `activities` is a collection of activity labels to look for.
    """
    matches = group[activity_col].isin(activities)
    return bool(matches.any())
    


In [None]:
# Load the raw ';'-separated event log.
data = pd.read_csv(os.path.join(input_data_folder, in_filename), sep=";")
# Events without a case id are pooled under one placeholder id.
data[case_id_col] = data[case_id_col].fillna("missing_caseid")
# Strip the '(case) ' prefix from the column names.
data = data.rename(columns=lambda name: name.replace('(case) ', ''))

In [None]:
# Preview the first rows of the loaded log.
data.head()

In [None]:
# remove incomplete cases: keep only cases that contain at least one of the
# final activities. A vectorised membership test replaces the per-case Python
# callback run through groupby.apply.
final_activities = ["BILLED", "DELETE", "FIN"]
has_final_activity = data[activity_col].isin(final_activities)
complete_cases = data.loc[has_final_activity, case_id_col].unique()
data = data[data[case_id_col].isin(complete_cases)]

In [None]:
# Keep only the modelling columns. The explicit copy makes the column
# assignments below write to an independent frame instead of a slice of the
# previous one (avoids SettingWithCopyWarning).
data = data[static_cols + dynamic_cols].copy()

# add features extracted from timestamp
data[timestamp_col] = pd.to_datetime(data[timestamp_col])
data["month"] = data[timestamp_col].dt.month
data["weekday"] = data[timestamp_col].dt.weekday
data["hour"] = data[timestamp_col].dt.hour
# group_keys=False stops pandas from prepending the case id as an extra index
# level; otherwise the case id would be both an index level and a column,
# which makes later groupby calls on that column ambiguous.
data = data.groupby(case_id_col, group_keys=False).apply(extract_timestamp_features)



In [None]:
# add inter-case features
print("Extracting open cases...")
sys.stdout.flush()
data = data.sort_values([timestamp_col], ascending=True, kind='mergesort')
# Aggregations are named by string; passing the builtins min/max is deprecated
# in recent pandas.
dt_first_last_timestamps = data.groupby(case_id_col)[timestamp_col].agg(["min", "max"])
dt_first_last_timestamps.columns = ["start_time", "end_time"]
case_end_times = dt_first_last_timestamps["end_time"].to_dict()

# Walk the events in chronological order and record, for every event, how many
# cases are open just before it. A case counts as open from its first event
# until an event at (or after) its last timestamp is seen.
# The counts are collected in a list and assigned once: DataFrame.set_value no
# longer exists in current pandas, and this does not depend on the row index.
open_case_ids = set()
open_cases = []
for case, current_ts in zip(data[case_id_col], data[timestamp_col]):
    # save the state
    open_cases.append(len(open_case_ids))

    if current_ts >= case_end_times[case]:
        open_case_ids.discard(case)
    else:
        open_case_ids.add(case)

data["open_cases"] = open_cases



In [None]:
print("Imputing missing values...")
sys.stdout.flush()
# impute missing values: within each case, carry the last observed value
# forward over the events ordered by time
grouped = data.sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col)
for col in static_cols + dynamic_cols:
    if col == case_id_col:
        # the grouping key is constant inside its own group, nothing to fill
        continue
    # GroupBy.ffill replaces transform(lambda g: g.fillna(method='ffill'));
    # the `method` argument of fillna is deprecated in recent pandas.
    # The result keeps the row labels, so the assignment aligns on the index.
    data[col] = grouped[col].ffill()

# whatever is still missing had no earlier value in its case
data[cat_cols] = data[cat_cols].fillna('missing')
data = data.fillna(0)
    


In [None]:
# Collapse rare categorical levels: any level seen fewer than
# category_freq_threshold times in a column is replaced by "other".
for column in cat_cols:
    level_counts = data[column].value_counts()
    frequent_levels = level_counts[level_counts >= category_freq_threshold].index
    is_rare = ~data[column].isin(frequent_levels)
    data.loc[is_rare, column] = "other"

# Stable chronological ordering of the final frame.
data = data.sort_values(timestamp_col, kind="mergesort", ascending=True)

In [None]:
# Write the preprocessed log as a ';'-separated CSV without the row index.
data.to_csv(os.path.join(output_data_folder, "hospital_billing.csv"), sep=";", index=False)

In [3]:
# Reload the raw log from the input folder; this rebinds `data`, replacing the
# preprocessed frame built in the cells above.
data = pd.read_csv(os.path.join(input_data_folder, in_filename), sep=";")

In [None]:
# Preview the first rows of the reloaded raw log.
data.head()

In [6]:
def get_case_length(group):
    """Store the number of rows of `group` in a new 'case_length' column.

    The column is written onto the DataFrame that was passed in, and that same
    object is returned.
    """
    n_events = len(group)
    group['case_length'] = n_events
    return group

In [7]:
# (rows, columns) of the raw log before the case_length column is added.
print(data.shape)

(428626, 28)


In [8]:
# Attach to every event the number of events in its case. A grouped transform
# leaves the frame's index and existing columns untouched; running a Python
# function through groupby(...).apply is slower and the layout of its result
# differs between pandas versions.
data = data.assign(
    case_length=data.groupby(case_id_col)[case_id_col].transform("size")
)

In [None]:
# Keep only cases with more than two events. The result gets its own name so
# that `data` stays unfiltered: the summary cells that follow report the
# unfiltered counts, and `data2` is inspected before the later cell that
# assigns it.
data2 = data[data["case_length"] > 2]

In [13]:
# Number of distinct cases for each case length.
data.groupby("case_length")[case_id_col].nunique()

case_length
2       8324
3       1931
4        797
5      22900
6      35125
7       3375
8        919
9       1125
10      1372
11       453
12       239
13       443
14       183
15        90
16        60
17        50
18        37
19        25
20        16
21        13
22        15
23         6
24         3
25         4
26         1
27         3
28         1
29         1
31         1
32         1
33         1
34         2
40         1
41         1
42         1
49         1
52         1
117        1
157        1
176        1
217        1
Name: case_id, dtype: int64

In [14]:
# (rows, columns) of `data` at this point.
data.shape

(428626, 29)

In [16]:
# Shape of the log restricted to cases with more than two events. It is
# computed from `data` directly because `data2` is only assigned in a later
# cell, so referencing it here fails on a fresh kernel.
data[data["case_length"] > 2].shape

(411978, 29)

In [22]:
# Keep only the events of cases that have more than two events.
data2 = data.loc[data["case_length"].gt(2)]

In [20]:
# The helper column is not part of the exported log.
data2 = data2.drop(columns=["case_length"])

In [21]:
# Write the filtered log as a ';'-separated CSV without the row index.
# NOTE(review): in_filename is "hospital_billing.csv", so this targets the same
# path in output_data_folder as the preprocessed export earlier in the
# notebook — confirm the overwrite is intended.
data2.to_csv(os.path.join(output_data_folder, in_filename), sep=";", index=False)

In [18]:
# Sanity check: rows before minus rows after filtering, divided by 2, gives the
# number of two-event cases. Floor division keeps the result an integer on
# Python 3, matching the case count it is compared with.
(428626 - 411978) // 2

8324

In [25]:
# Distinct cases per case length among cases with more than two events.
# This is computed from `data` because the case_length column has already
# been dropped from `data2` by the time this cell runs in document order.
data[data["case_length"] > 2].groupby("case_length")[case_id_col].nunique()

case_length
3       1931
4        797
5      22900
6      35125
7       3375
8        919
9       1125
10      1372
11       453
12       239
13       443
14       183
15        90
16        60
17        50
18        37
19        25
20        16
21        13
22        15
23         6
24         3
25         4
26         1
27         3
28         1
29         1
31         1
32         1
33         1
34         2
40         1
41         1
42         1
49         1
52         1
117        1
157        1
176        1
217        1
Name: case_id, dtype: int64

In [24]:
# (rows, columns) of the filtered log.
data2.shape

(411978, 29)

In [27]:
# Number of distinct cases remaining in the filtered log.
data2[case_id_col].nunique()

69201