In [64]:
import pandas as pd
import numpy as np
import os

In [65]:
input_data_folder = "../experiments/labeled_logs_csv/"
output_data_folder = "../experiments/logdata/"
filename = "Production_Data.csv"

In [66]:
case_id_col = "Case ID"
activity_col = "Activity"
resource_col = "Resource"
timestamp_col = "Complete Timestamp"

In [67]:
freq_threshold = 10

In [68]:
# features for classifier
static_cat_cols = ["Part Desc"]
static_num_cols = ["Work Order Qty"]
dynamic_cat_cols = [activity_col, resource_col, "Report Type", "Worker ID"]
dynamic_num_cols = ["Qty Completed", "Qty for MRB", "activity_duration"]

In [69]:
static_cols = static_cat_cols + static_num_cols + [case_id_col]
dynamic_cols = dynamic_cat_cols + dynamic_num_cols + [timestamp_col]
cat_cols = dynamic_cat_cols + static_cat_cols

In [70]:
def extract_timestamp_features(group):
    
    group = group.sort_values(timestamp_col, ascending=False, kind='mergesort')
    
    tmp = group[timestamp_col] - group[timestamp_col].shift(-1)
    tmp = tmp.fillna(0)
    group["timesincelastevent"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 's')))

    tmp = group[timestamp_col] - group[timestamp_col].iloc[-1]
    tmp = tmp.fillna(0)
    group["timesincecasestart"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 's')))
    
    tmp = group[timestamp_col].iloc[0] - group[timestamp_col]
    tmp = tmp.fillna(0)
    group["remtime"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 's')))

    group = group.sort_values(timestamp_col, ascending=True, kind='mergesort')
    group["event_nr"] = range(1, len(group) + 1)
    
    return group

In [71]:
def get_open_cases(date):
    return sum((dt_first_last_timestamps["start_time"] <= date) & (dt_first_last_timestamps["end_time"] > date))


In [72]:
data = pd.read_csv(os.path.join(input_data_folder,filename), sep=",")

In [73]:
# add event duration
data["Complete Timestamp"] = pd.to_datetime(data["Complete Timestamp"])
data["Start Timestamp"] = pd.to_datetime(data["Start Timestamp"])
tmp = data["Complete Timestamp"] - data["Start Timestamp"]
tmp = tmp.fillna(0)
data["activity_duration"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 's')))

In [None]:
def assign_label(group):
    tmp = group["Qty Rejected"] > 0
    tmp = tmp.reset_index()["Qty Rejected"]
    if sum(tmp) > 0:
        idx = tmp[tmp==True].index[0]
        group = group.iloc[:idx,:]
        group[label_col] = pos_label
    else:
        group[label_col] = neg_label
    return group

In [61]:
# assign labels
label_col = "label"
pos_label = "deviant"
neg_label = "regular"
#data = data.sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col).apply(assign_label)

In [74]:
data = data[static_cols + dynamic_cols]

In [75]:
data.head(3)

Unnamed: 0,Part Desc,Work Order Qty,Case ID,Activity,Resource,Report Type,Worker ID,Qty Completed,Qty for MRB,activity_duration,Complete Timestamp
0,Cable Head,10,Case 1,Turning & Milling - Machine 4,Machine 4 - Turning & Milling,S,ID4932,1,0,22740.0,2012-01-30 05:43:00
1,Cable Head,10,Case 1,Turning & Milling - Machine 4,Machine 4 - Turning & Milling,D,ID4932,1,0,3480.0,2012-01-30 06:42:00
2,Cable Head,10,Case 1,Turning & Milling - Machine 4,Machine 4 - Turning & Milling,S,ID4167,0,0,1320.0,2012-01-30 07:21:00


In [76]:
# add features extracted from timestamp
data["weekday"] = data[timestamp_col].dt.weekday
data["hour"] = data[timestamp_col].dt.hour
data = data.groupby(case_id_col).apply(extract_timestamp_features)

In [77]:
# add inter-case features
data = data.sort_values([timestamp_col], ascending=True, kind='mergesort')
dt_first_last_timestamps = data.groupby(case_id_col)[timestamp_col].agg([min, max])
dt_first_last_timestamps.columns = ["start_time", "end_time"]
data["open_cases"] = data[timestamp_col].apply(get_open_cases)

Defaulting to column, but this will raise an ambiguity error in a future version
  This is separate from the ipykernel package so we can avoid doing imports until


In [78]:
# impute missing values
grouped = data.sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col)
for col in static_cols + dynamic_cols:
    data[col] = grouped[col].transform(lambda grp: grp.fillna(method='ffill'))

data[cat_cols] = data[cat_cols].fillna('missing')
data = data.fillna(0)

Defaulting to column, but this will raise an ambiguity error in a future version
  


In [79]:
# set infrequent factor levels to "other"
for col in cat_cols:
    counts = data[col].value_counts()
    mask = data[col].isin(counts[counts >= freq_threshold].index)
    data.loc[~mask, col] = "other"

In [80]:
data.to_csv(os.path.join(output_data_folder,filename), sep=";", index=False)