In [42]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict, Counter

input_data_folder = "../logdata/orig/"
output_data_folder = "../logdata/"
in_filename = "CreditRequirement.csv"

In [43]:
case_id_col = "case_id"
activity_col = "Activity"
timestamp_col = "Complete Timestamp"

In [44]:
category_freq_threshold = 10

In [45]:
data = pd.read_csv(os.path.join(input_data_folder, in_filename), sep=",")

In [46]:
columns_to_remove = ["Variant", "Variant index", "Resource"]

In [47]:
data = data.drop(columns_to_remove, axis =1)

In [48]:
data.head()

Unnamed: 0,case_id,Activity,Start Timestamp,Complete Timestamp
0,1,Register,2014/04/02 08:00:48.000,2014/04/02 08:00:48.000
1,1,Acceptance of requests,2014/04/02 08:00:48.000,2014/04/02 08:18:43.000
2,1,Collection of documents,2014/04/02 08:18:43.000,2014/04/02 09:47:48.000
3,1,Completeness check,2014/04/02 09:47:48.000,2014/04/02 11:05:04.000
4,1,Credit worthiness check,2014/04/02 11:05:04.000,2014/04/02 12:40:06.000


In [49]:
# features for classifier
dynamic_cat_cols = ["Activity"] # i.e. event attributes
static_cat_cols = []
static_num_cols = []
dynamic_num_cols = ["activity_duration"] # i.e. case attributes that are known from the start

In [50]:
static_cols = static_cat_cols + static_num_cols + [case_id_col]
dynamic_cols = dynamic_cat_cols + dynamic_num_cols + [timestamp_col]
cat_cols = dynamic_cat_cols + static_cat_cols

In [51]:
def extract_timestamp_features(group):
    
    group = group.sort_values(timestamp_col, ascending=False, kind='mergesort')
    
    tmp = group[timestamp_col] - group[timestamp_col].shift(-1)
    tmp = tmp.fillna(0)
    group["timesincelastevent"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 'm'))) # m is for minutes

    tmp = group[timestamp_col] - group[timestamp_col].iloc[-1]
    tmp = tmp.fillna(0)
    group["timesincecasestart"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 'm'))) # m is for minutes
    
    return group


def get_open_cases(date):
    return sum((dt_first_last_timestamps["start_time"] <= date) & (dt_first_last_timestamps["end_time"] > date))  
    

In [52]:
data["Complete Timestamp"] = pd.to_datetime(data["Complete Timestamp"])

In [53]:
data["Start Timestamp"] = pd.to_datetime(data["Start Timestamp"])

In [54]:
tmp = data["Complete Timestamp"] - data["Start Timestamp"]

In [55]:
tmp = tmp.fillna(0)

In [56]:
data["activity_duration"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 'm')))

In [79]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,case_id,Activity,activity_duration,Complete Timestamp,timesincemidnight,month,weekday,hour,timesincecasestart,open_cases
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,1,Register,0.0,2014-04-02 08:00:48,480,4,2,8,0.0,1
2,8,2,Register,0.0,2014-04-02 08:01:32,481,4,2,8,0.0,2
3,16,3,Register,0.0,2014-04-02 08:07:17,487,4,2,8,0.0,3
4,24,4,Register,0.0,2014-04-02 08:08:30,488,4,2,8,0.0,4
5,32,5,Register,0.0,2014-04-02 08:14:25,494,4,2,8,0.0,5


In [58]:
data = data[static_cols + dynamic_cols]

In [59]:
data.head()

Unnamed: 0,case_id,Activity,activity_duration,Complete Timestamp
0,1,Register,0.0,2014-04-02 08:00:48
1,1,Acceptance of requests,17.916667,2014-04-02 08:18:43
2,1,Collection of documents,89.083333,2014-04-02 09:47:48
3,1,Completeness check,77.266667,2014-04-02 11:05:04
4,1,Credit worthiness check,95.033333,2014-04-02 12:40:06


In [60]:
# add features extracted from timestamp
data[timestamp_col] = pd.to_datetime(data[timestamp_col])
data["timesincemidnight"] = data[timestamp_col].dt.hour * 60 + data[timestamp_col].dt.minute
data["month"] = data[timestamp_col].dt.month
data["weekday"] = data[timestamp_col].dt.weekday
data["hour"] = data[timestamp_col].dt.hour
data = data.groupby(case_id_col).apply(extract_timestamp_features)

In [61]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,case_id,Activity,activity_duration,Complete Timestamp,timesincemidnight,month,weekday,hour,timesincelastevent,timesincecasestart
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,7,1,Requirements review,12.083333,2014-04-02 14:29:46,869,4,2,14,12.083333,388.966667
1,6,1,Credit committee,42.1,2014-04-02 14:17:41,857,4,2,14,42.1,376.883333
1,5,1,Collateral check,55.483333,2014-04-02 13:35:35,815,4,2,13,55.483333,334.783333
1,4,1,Credit worthiness check,95.033333,2014-04-02 12:40:06,760,4,2,12,95.033333,279.3
1,3,1,Completeness check,77.266667,2014-04-02 11:05:04,665,4,2,11,77.266667,184.266667


In [62]:
# add inter-case features
data = data.sort_values([timestamp_col], ascending=True, kind='mergesort')
dt_first_last_timestamps = data.groupby(case_id_col)[timestamp_col].agg([min, max])
dt_first_last_timestamps.columns = ["start_time", "end_time"]
data["open_cases"] = data[timestamp_col].apply(get_open_cases)



Defaulting to column but this will raise an ambiguity error in a future version
  This is separate from the ipykernel package so we can avoid doing imports until


In [63]:
# impute missing values
grouped = data.sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col)
for col in static_cols + dynamic_cols:
    data[col] = grouped[col].transform(lambda grp: grp.fillna(method='ffill'))
        
data[cat_cols] = data[cat_cols].fillna('missing')
data = data.fillna(0)    


Defaulting to column but this will raise an ambiguity error in a future version
  


In [77]:
data.to_csv(os.path.join(output_data_folder, in_filename), sep=";", index=False)

In [71]:
data = data.drop("timesincelastevent", axis =1)

In [75]:
data = data.drop("bil", axis =1)

In [78]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,case_id,Activity,activity_duration,Complete Timestamp,timesincemidnight,month,weekday,hour,timesincecasestart,open_cases
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,1,Register,0.0,2014-04-02 08:00:48,480,4,2,8,0.0,1
2,8,2,Register,0.0,2014-04-02 08:01:32,481,4,2,8,0.0,2
3,16,3,Register,0.0,2014-04-02 08:07:17,487,4,2,8,0.0,3
4,24,4,Register,0.0,2014-04-02 08:08:30,488,4,2,8,0.0,4
5,32,5,Register,0.0,2014-04-02 08:14:25,494,4,2,8,0.0,5


0.0    80280
Name: bil, dtype: int64