In [1]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict, Counter

input_data_folder = "../logdata/orig/"
output_data_folder = "../logdata/"
in_filename = "Sepsis Cases - Event Log.csv"

In [23]:
case_id_col = "case_id"
activity_col = "Activity"
timestamp_col = "Complete Timestamp"

In [24]:
category_freq_threshold = 10

In [3]:
# features for classifier
dynamic_cat_cols = ["Activity", 'org:group'] # i.e. event attributes
static_cat_cols = ['Diagnose', 'DiagnosticArtAstrup', 'DiagnosticBlood', 'DiagnosticECG',
       'DiagnosticIC', 'DiagnosticLacticAcid', 'DiagnosticLiquor',
       'DiagnosticOther', 'DiagnosticSputum', 'DiagnosticUrinaryCulture',
       'DiagnosticUrinarySediment', 'DiagnosticXthorax', 'DisfuncOrg',
       'Hypotensie', 'Hypoxie', 'InfectionSuspected', 'Infusion', 'Oligurie',
       'SIRSCritHeartRate', 'SIRSCritLeucos', 'SIRSCritTachypnea',
       'SIRSCritTemperature', 'SIRSCriteria2OrMore'] # i.e. case attributes that are known from the start
dynamic_num_cols = ['CRP', 'LacticAcid', 'Leucocytes']
static_num_cols = ['Age']

In [5]:
static_cols = static_cat_cols + static_num_cols + [case_id_col]
dynamic_cols = dynamic_cat_cols + dynamic_num_cols + [timestamp_col]
cat_cols = dynamic_cat_cols + static_cat_cols

In [6]:
def extract_timestamp_features(group):
    
    group = group.sort_values(timestamp_col, ascending=False, kind='mergesort')
    
    tmp = group[timestamp_col] - group[timestamp_col].shift(-1)
    tmp = tmp.fillna(0)
    group["timesincelastevent"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 'm'))) # m is for minutes

    tmp = group[timestamp_col] - group[timestamp_col].iloc[-1]
    tmp = tmp.fillna(0)
    group["timesincecasestart"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 'm'))) # m is for minutes

    group = group.sort_values(timestamp_col, ascending=True, kind='mergesort')
    group["event_nr"] = range(1, len(group) + 1)
    
    return group


def get_open_cases(date):
    return sum((dt_first_last_timestamps["start_time"] <= date) & (dt_first_last_timestamps["end_time"] > date))


def check_if_any_of_activities_exist(group, activities):
    if np.sum(group[activity_col].isin(activities)) > 0:
        return True
    else:
        return False
    
    

In [7]:
data = pd.read_csv(os.path.join(input_data_folder, in_filename), sep=",")

In [8]:
data.head()

Unnamed: 0,case_id,Activity,Complete Timestamp,Variant,Variant index,Age,CRP,Diagnose,DiagnosticArtAstrup,DiagnosticBlood,...,LacticAcid,Leucocytes,Oligurie,SIRSCritHeartRate,SIRSCritLeucos,SIRSCritTachypnea,SIRSCritTemperature,SIRSCriteria2OrMore,lifecycle:transition,org:group
0,A,ER Registration,2014/10/22 09:15:41.000,Variant 63,63,85.0,,A,True,True,...,,,False,True,False,True,True,True,complete,A
1,A,Leucocytes,2014/10/22 09:27:00.000,Variant 63,63,,,,,,...,,9.6,,,,,,,complete,B
2,A,CRP,2014/10/22 09:27:00.000,Variant 63,63,,21.0,,,,...,,,,,,,,,complete,B
3,A,LacticAcid,2014/10/22 09:27:00.000,Variant 63,63,,,,,,...,2.2,,,,,,,,complete,B
4,A,ER Triage,2014/10/22 09:33:37.000,Variant 63,63,,,,,,...,,,,,,,,,complete,C


In [9]:
data.shape

(15214, 34)

In [10]:
data[case_id_col] = data[case_id_col].fillna("missing_caseid")

In [11]:
# remove incomplete cases
tmp = data.groupby(case_id_col).apply(check_if_any_of_activities_exist, activities=["Release A", "Release B", "Release C", "Release D", "Release E"])
incomplete_cases = tmp.index[tmp==False]
data = data[~data[case_id_col].isin(incomplete_cases)]


In [12]:
data = data[static_cols + dynamic_cols]

In [13]:
# add features extracted from timestamp
data[timestamp_col] = pd.to_datetime(data[timestamp_col])
data["timesincemidnight"] = data[timestamp_col].dt.hour * 60 + data[timestamp_col].dt.minute
data["month"] = data[timestamp_col].dt.month
data["weekday"] = data[timestamp_col].dt.weekday
data["hour"] = data[timestamp_col].dt.hour
data = data.groupby(case_id_col).apply(extract_timestamp_features)


In [14]:
# add inter-case features
data = data.sort_values([timestamp_col], ascending=True, kind='mergesort')
dt_first_last_timestamps = data.groupby(case_id_col)[timestamp_col].agg([min, max])
dt_first_last_timestamps.columns = ["start_time", "end_time"]
data["open_cases"] = data[timestamp_col].apply(get_open_cases)



Defaulting to column but this will raise an ambiguity error in a future version
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
# impute missing values
grouped = data.sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col)
for col in static_cols + dynamic_cols:
    data[col] = grouped[col].transform(lambda grp: grp.fillna(method='ffill'))
        
data[cat_cols] = data[cat_cols].fillna('missing')
data = data.fillna(0)    


Defaulting to column but this will raise an ambiguity error in a future version
  


In [16]:
# set infrequent factor levels to "other"
for col in cat_cols:
    counts = data[col].value_counts()
    mask = data[col].isin(counts[counts >= category_freq_threshold].index)
    data.loc[~mask, col] = "other"
    

In [None]:
# second labeling
dt_labeled = data.sort_values(timestamp_col, ascending=True, kind="mergesort").groupby(case_id_col).apply(check_if_activity_exists, activity="Admission IC")


In [19]:
data.to_csv(os.path.join(output_data_folder, "sepsis_cases.csv"), sep=";", index=False)

In [21]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Diagnose,DiagnosticArtAstrup,DiagnosticBlood,DiagnosticECG,DiagnosticIC,DiagnosticLacticAcid,DiagnosticLiquor,DiagnosticOther,DiagnosticSputum,DiagnosticUrinaryCulture,...,Leucocytes,Complete Timestamp,timesincemidnight,month,weekday,hour,timesincelastevent,timesincecasestart,event_nr,open_cases
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
XJ,3833,C,False,True,True,True,True,False,False,False,True,...,0.0,2013-11-07 07:18:29,438,11,3,7,0.0,0.0,1,1
XJ,3834,C,False,True,True,True,True,False,False,False,True,...,0.0,2013-11-07 07:29:18,449,11,3,7,10.816667,10.816667,2,1
XJ,3835,C,False,True,True,True,True,False,False,False,True,...,0.0,2013-11-07 07:37:32,457,11,3,7,8.233333,19.05,3,1
XJ,3836,C,False,True,True,True,True,False,False,False,True,...,0.0,2013-11-07 07:51:00,471,11,3,7,0.0,32.516667,4,1
XJ,3837,C,False,True,True,True,True,False,False,False,True,...,296.2,2013-11-07 07:51:00,471,11,3,7,0.0,32.516667,5,1


In [20]:
data.shape

(13422, 39)

In [22]:
data.columns

Index([u'Diagnose', u'DiagnosticArtAstrup', u'DiagnosticBlood',
       u'DiagnosticECG', u'DiagnosticIC', u'DiagnosticLacticAcid',
       u'DiagnosticLiquor', u'DiagnosticOther', u'DiagnosticSputum',
       u'DiagnosticUrinaryCulture', u'DiagnosticUrinarySediment',
       u'DiagnosticXthorax', u'DisfuncOrg', u'Hypotensie', u'Hypoxie',
       u'InfectionSuspected', u'Infusion', u'Oligurie', u'SIRSCritHeartRate',
       u'SIRSCritLeucos', u'SIRSCritTachypnea', u'SIRSCritTemperature',
       u'SIRSCriteria2OrMore', u'Age', u'case_id', u'Activity', u'org:group',
       u'CRP', u'LacticAcid', u'Leucocytes', u'Complete Timestamp',
       u'timesincemidnight', u'month', u'weekday', u'hour',
       u'timesincelastevent', u'timesincecasestart', u'event_nr',
       u'open_cases'],
      dtype='object')