In [93]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, GRU, SimpleRNN, Input, BatchNormalization
from tensorflow.keras.utils import get_file
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [183]:
from __future__ import print_function, division
from collections import Counter
import csv
import numpy as np
import random
import sys
import os
import copy
import time
from datetime import datetime,timedelta
from math import log, sqrt
import pandas as pd
import distance
from keras.models import load_model
import matplotlib.pyplot as plt
from jellyfish._jellyfish import damerau_levenshtein_distance

# 1. Read file

In [194]:
# Dataset selection: a short tag used when naming output files, and the
# preprocessed event-log CSV that is read from ../data/.
eventlog_name = "bpi13"
eventlog = "preprocessed_bpi13_lowV.csv"

In [195]:
# Read every data row of the event log up front so the file handle is closed
# immediately instead of staying open for the rest of the session.
# `spamreader` keeps its name because the parsing loop iterates over it; it is
# now a list of rows (header already dropped) rather than a lazy csv reader.
# newline='' is what the csv module expects for files handed to csv.reader.
with open('../data/%s' % eventlog, 'r', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers
    spamreader = list(spamreader)
ascii_offset = 161  # added to each activity id before chr() turns it into one character

# 2. Preprocessing

In [196]:
# Results collected per case
lines = list()      # one string per case; each character encodes one activity
timeseqs = list()   # per case: time differences between consecutive events
timeseqs2 = list()  # per case: time differences between each event and the first one

# Scratch state for the case currently being read
line, times, times2 = '', [], []
lastcase = ''
firstLine = True
numlines = 0
casestarttime = lasteventtime = None

In [197]:
# Walk the event log once. Rows are "CaseID,ActivityID,CompleteTimestamp"; a change
# in CaseID closes the case collected so far and opens a new one.
for row in spamreader:
    t = datetime.strptime(row[2], "%Y-%m-%d %H:%M:%S")  # completion time of this event
    if lastcase != row[0]:
        # Store the case that just ended (there is none before the very first row).
        if not firstLine:
            lines.append(line)
            timeseqs.append(times)
            timeseqs2.append(times2)
        # Reset the per-case state; both reference times start at this event.
        lastcase = row[0]
        casestarttime = lasteventtime = t
        line, times, times2 = '', [], []
        numlines += 1
    # One character per event: the activity id shifted by ascii_offset.
    line += chr(int(row[1]) + ascii_offset)
    timesincelastevent = t - lasteventtime
    timesincecasestart = t - casestarttime
    # Whole seconds, assembled from the timedelta's day and second components.
    timediff = timesincelastevent.seconds + 86400 * timesincelastevent.days
    timediff2 = timesincecasestart.seconds + 86400 * timesincecasestart.days
    times.append(timediff)
    times2.append(timediff2)
    lasteventtime = t
    firstLine = False

# The final case is never followed by a CaseID change, so it is stored here.
lines.append(line)
timeseqs.append(times)
timeseqs2.append(times2)
# NOTE(review): numlines was already incremented when this case started, so after
# the next line it is one more than len(lines) - confirm this is intended before
# relying on it as the case count.
numlines += 1

In [199]:
# Example of accessing processed data
#print(lines)
#print(timeseqs)
#print(timeseqs2)

In [200]:
# Normalisation constants, each taken over every event of every case read so far:
# divisor  - mean time since the previous event
# divisor2 - mean time since the first event of the case
divisor = np.mean([gap for case_gaps in timeseqs for gap in case_gaps])
print('divisor: {}'.format(divisor))
divisor2 = np.mean([elapsed for case_elapsed in timeseqs2 for elapsed in case_elapsed])
print('divisor2: {}'.format(divisor2))


divisor: 115683.72903116817
divisor2: 521824.4638905313


In [201]:
# Three-way split of the cases in file order. The first two folds hold
# elems_per_fold cases each; the third takes whatever remains.
elems_per_fold = int(round(numlines / 3))

# first third: activity strings and their two time features
fold1, fold1_t, fold1_t2 = (
    seqs[:elems_per_fold] for seqs in (lines, timeseqs, timeseqs2)
)
# second third
fold2, fold2_t, fold2_t2 = (
    seqs[elems_per_fold:2 * elems_per_fold] for seqs in (lines, timeseqs, timeseqs2)
)
# last third
fold3, fold3_t, fold3_t2 = (
    seqs[2 * elems_per_fold:] for seqs in (lines, timeseqs, timeseqs2)
)

# Only the first and second folds form the training set; fold3 is left out for now.
lines = fold1 + fold2
lines_t = fold1_t + fold2_t
lines_t2 = fold1_t2 + fold2_t2

In [202]:
step = 1
softness = 0
sentences, next_chars = [], []

# Terminate every training trace with the delimiter symbol '!'.
# Note: `lines` is overwritten, so running this cell twice appends the delimiter twice.
lines = [trace + '!' for trace in lines]
maxlen = max(len(trace) for trace in lines)  # longest trace, delimiter included

# Alphabet: every distinct character occurring in the training traces, sorted so
# that each symbol can be annotated with a stable number.
chars = sorted(set().union(*lines))

In [203]:
# Prediction targets keep the full alphabet; the input alphabet drops the '!' delimiter.
target_chars = list(chars)

try:
    chars.remove('!')
except ValueError:
    pass  # delimiter not in the alphabet, nothing to drop
print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))

total chars: 10, target chars: 11


In [204]:
# Lookup tables in both directions between symbols and their integer positions,
# once for the input alphabet and once for the target alphabet.
char_indices = {symbol: position for position, symbol in enumerate(chars)}
indices_char = dict(enumerate(chars))
target_char_indices = {symbol: position for position, symbol in enumerate(target_chars)}
target_indices_char = dict(enumerate(target_chars))
print(indices_char)

{0: '¢', 1: '£', 2: '¤', 3: '¥', 4: '¦', 5: '§', 6: '¨', 7: '©', 8: 'ª', 9: '«'}


# 3. Feature Engineering

In [205]:
# Second pass over the same event log, this time collecting two additional
# per-event time features. The rows are read inside a `with` block so the file is
# closed right away; `spamreader` ends up as a list of data rows (header dropped)
# that the parsing loop can iterate over.
with open('../data/%s' % eventlog, 'r', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(spamreader, None)  # skip the headers
    spamreader = list(spamreader)

# Per-case results
lines = []
timeseqs = []   # time since the previous event
timeseqs2 = []  # time since the first event of the case
timeseqs3 = []  # time since midnight
timeseqs4 = []  # day of the week

# Scratch state for the case currently being read
lastcase = ''
line = ''
firstLine = True
times = []
times2 = []
times3 = []
times4 = []
numlines = 0
casestarttime = None
lasteventtime = None

In [206]:
# Second pass: same case segmentation as the first pass, plus time-of-day and
# weekday features. Timestamps are parsed straight into naive datetime objects
# (as in the first pass) instead of going through time.mktime/fromtimestamp,
# which interprets the value in the machine's local timezone and can therefore
# shift times around DST changes.
for row in spamreader:
    t = datetime.strptime(row[2], "%Y-%m-%d %H:%M:%S")
    if row[0] != lastcase:
        casestarttime = t
        lasteventtime = t
        lastcase = row[0]
        # Store the case that just ended (there is none before the first row).
        if not firstLine:
            lines.append(line)
            timeseqs.append(times)
            timeseqs2.append(times2)
            timeseqs3.append(times3)
            timeseqs4.append(times4)
        line = ''
        times = []
        times2 = []
        times3 = []
        times4 = []
        numlines += 1
    line += chr(int(row[1]) + ascii_offset)
    timesincelastevent = t - lasteventtime
    timesincecasestart = t - casestarttime
    midnight = t.replace(hour=0, minute=0, second=0, microsecond=0)
    timesincemidnight = t - midnight
    timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
    timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
    timediff3 = timesincemidnight.seconds  # seconds elapsed since midnight of the event's day
    timediff4 = t.weekday()  # day of the week
    times.append(timediff)
    times2.append(timediff2)
    times3.append(timediff3)
    times4.append(timediff4)
    lasteventtime = t
    firstLine = False

# add last case
lines.append(line)
timeseqs.append(times)
timeseqs2.append(times2)
timeseqs3.append(times3)
timeseqs4.append(times4)
# NOTE(review): numlines was already incremented when the last case started, so it
# ends up one more than len(lines). Left unchanged so the fold boundaries stay
# aligned with the first pass - confirm whether len(lines) was intended.
numlines += 1

In [207]:
def _write_fold(path, traces, gap_seqs):
    """Write one fold to a CSV file.

    Each case becomes one row and each event one cell of the form
    '<activity symbol>#<time since previous event>'.

    path     -- destination file (overwritten if it already exists)
    traces   -- activity strings, one per case
    gap_seqs -- time-difference lists aligned with `traces`
    """
    with open(path, 'w', newline='') as fold_file:
        fold_writer = csv.writer(fold_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for trace, gaps in zip(traces, gap_seqs):
            fold_writer.writerow([str(s) + '#{}'.format(g) for s, g in zip(trace, gaps)])


elems_per_fold = int(round(numlines/3))  # calculate the number of elements per fold
# Make sure the export directory exists so the writes below cannot fail on a fresh checkout.
os.makedirs('output_files/folds', exist_ok=True)

# fold 1
fold1 = lines[:elems_per_fold]
fold1_t = timeseqs[:elems_per_fold]
fold1_t2 = timeseqs2[:elems_per_fold]
fold1_t3 = timeseqs3[:elems_per_fold]
fold1_t4 = timeseqs4[:elems_per_fold]
_write_fold(f'output_files/folds/{eventlog_name}_fold1.csv', fold1, fold1_t)

# fold 2
fold2 = lines[elems_per_fold:2*elems_per_fold]
fold2_t = timeseqs[elems_per_fold:2*elems_per_fold]
fold2_t2 = timeseqs2[elems_per_fold:2*elems_per_fold]
fold2_t3 = timeseqs3[elems_per_fold:2*elems_per_fold]
fold2_t4 = timeseqs4[elems_per_fold:2*elems_per_fold]
_write_fold(f'output_files/folds/{eventlog_name}_fold2.csv', fold2, fold2_t)

# fold 3
fold3 = lines[2*elems_per_fold:]
fold3_t = timeseqs[2*elems_per_fold:]
fold3_t2 = timeseqs2[2*elems_per_fold:]
fold3_t3 = timeseqs3[2*elems_per_fold:]
fold3_t4 = timeseqs4[2*elems_per_fold:]
_write_fold(f'output_files/folds/{eventlog_name}_fold3.csv', fold3, fold3_t)

# Training data = folds 1 and 2 with all four time features; fold 3 is only exported.
lines = fold1 + fold2
lines_t = fold1_t + fold2_t
lines_t2 = fold1_t2 + fold2_t2
lines_t3 = fold1_t3 + fold2_t3
lines_t4 = fold1_t4 + fold2_t4

In [211]:
step = 1
softness = 0
# Close every training trace with the '!' delimiter.
# Note: `lines` is overwritten, so running this cell twice appends the delimiter twice.
lines = [trace + '!' for trace in lines]

# Prefixes ("sentences") of each trace together with the matching slices of the four
# time features, and for each prefix the symbol / feature values of the event after it.
sentences, next_chars = [], []
sentences_t, sentences_t2, sentences_t3, sentences_t4 = [], [], [], []
next_chars_t, next_chars_t2, next_chars_t3, next_chars_t4 = [], [], [], []

for line, line_t, line_t2, line_t3, line_t4 in zip(lines, lines_t, lines_t2, lines_t3, lines_t4):
    # i is both the prefix length and the index of the event to predict;
    # starting at `step` skips the empty prefix.
    for i in range(step, len(line), step):
        # prefixes grow iteratively: first symbol, first two, first three, ...
        sentences.append(line[:i])
        sentences_t.append(line_t[:i])
        sentences_t2.append(line_t2[:i])
        sentences_t3.append(line_t3[:i])
        sentences_t4.append(line_t4[:i])
        next_chars.append(line[i])

        if i != len(line)-1:
            next_chars_t.append(line_t[i])
            next_chars_t2.append(line_t2[i])
            next_chars_t3.append(line_t3[i])
            next_chars_t4.append(line_t4[i])
        else:
            # the delimiter was appended to the trace only, so it has no time
            # values of its own; its targets are set to 0
            next_chars_t.append(0)
            next_chars_t2.append(0)
            next_chars_t3.append(0)
            next_chars_t4.append(0)

print('nb sequences:', len(sentences))

nb sequences: 22263


# 4. Encoding

In [212]:
print('Vectorization...')
# Per time step: one-hot activity (len(chars) columns) followed by five extra columns:
# position in the prefix, and the four time features scaled by divisor, divisor2,
# 86400 and 7 respectively.
num_features = len(chars)+5
print('num features: {}'.format(num_features))
X = np.zeros((len(sentences), maxlen, num_features), dtype=np.float32)
y_a = np.zeros((len(sentences), len(target_chars)), dtype=np.float32)
y_t = np.zeros((len(sentences)), dtype=np.float32)

extra = len(chars)  # index of the first column after the one-hot block
# Label value given to every non-target class (0 while softness is 0).
off_target = softness/(len(target_chars)-1) if len(target_chars) > 1 else 0

for i, sentence in enumerate(sentences):
    leftpad = maxlen-len(sentence)  # prefixes are right-aligned; leading steps stay all-zero
    next_t = next_chars_t[i]
    sentence_t = sentences_t[i]
    sentence_t2 = sentences_t2[i]
    sentence_t3 = sentences_t3[i]
    sentence_t4 = sentences_t4[i]
    for t, char in enumerate(sentence):
        # one-hot encode the activity; symbols outside the alphabet are left as zeros
        if char in char_indices:
            X[i, t+leftpad, char_indices[char]] = 1
        X[i, t+leftpad, extra] = t+1
        X[i, t+leftpad, extra+1] = sentence_t[t]/divisor
        X[i, t+leftpad, extra+2] = sentence_t2[t]/divisor2
        X[i, t+leftpad, extra+3] = sentence_t3[t]/86400
        X[i, t+leftpad, extra+4] = sentence_t4[t]/7
    # activity target: 1-softness on the next symbol, off_target everywhere else
    y_a[i, :] = off_target
    if next_chars[i] in target_char_indices:
        y_a[i, target_char_indices[next_chars[i]]] = 1-softness
    # time target: time to the next event, scaled like the matching input feature
    y_t[i] = next_t/divisor

# Print arrays in full from here on; set once rather than on every loop iteration.
np.set_printoptions(threshold=sys.maxsize)

Vectorization...
num features: 15


# 5. Train LSTM

In [213]:
from tensorflow.keras.optimizers import legacy

In [214]:
print('Build model...')
# Network input: maxlen time steps with num_features values per step.
main_input = Input(
    name='main_input',
    shape=(maxlen, num_features),
)

Build model...


In [215]:
# Two-layer LSTM network: one shared first layer, then a separate second layer
# (each followed by batch normalisation) for each of the two prediction tasks.

# shared layer - returns the full sequence so both task-specific LSTMs can consume it
l1 = LSTM(
    units=100,
    return_sequences=True,
    dropout=0.2,
    kernel_initializer='glorot_uniform',
    implementation=2,
)(main_input)
b1 = BatchNormalization()(l1)

# layer specialised in activity prediction
l2_1 = LSTM(
    units=100,
    return_sequences=False,
    dropout=0.2,
    kernel_initializer='glorot_uniform',
    implementation=2,
)(b1)
b2_1 = BatchNormalization()(l2_1)

# layer specialised in time prediction
l2_2 = LSTM(
    units=100,
    return_sequences=False,
    dropout=0.2,
    kernel_initializer='glorot_uniform',
    implementation=2,
)(b1)
b2_2 = BatchNormalization()(l2_2)

# output heads: softmax over the target symbols, and a single unit for the time value
act_output = Dense(
    len(target_chars),
    name='act_output',
    activation='softmax',
    kernel_initializer='glorot_uniform',
)(b2_1)
time_output = Dense(
    1,
    name='time_output',
    kernel_initializer='glorot_uniform',
)(b2_2)

In [216]:
# One input, two outputs: next-activity distribution and next-event time.
model = Model(
    inputs=[main_input],
    outputs=[act_output, time_output],
)

In [217]:
# Nadam optimizer with gradient value clipping. `learning_rate` is the current
# argument name; `lr` is a deprecated alias for it.
opt = legacy.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.004, clipvalue=3)
# One loss per head: cross-entropy for the activity, mean absolute error for the time.
model.compile(loss={'act_output':'categorical_crossentropy', 'time_output':'mae'}, optimizer=opt)
# Stop when the validation activity loss has not improved for 42 epochs.
early_stopping = EarlyStopping(monitor='val_act_output_loss', patience=42)
# The checkpoint directory must exist before the first model file is written.
os.makedirs('output_files/models', exist_ok=True)
# NOTE(review): the checkpoint keeps the best model by combined 'val_loss', while the
# file name and the other two callbacks use 'val_act_output_loss' - confirm which
# metric is meant to select the saved model.
model_checkpoint = ModelCheckpoint('output_files/models/LSTM_model_{epoch:02d}-{val_act_output_loss:.2f}.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto')
# Halve the learning rate after 10 epochs without improvement of the validation activity loss.
lr_reducer = ReduceLROnPlateau(monitor='val_act_output_loss', factor=0.5, patience=10, verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)

In [218]:
# Train both heads jointly for up to 500 epochs, holding out 20% of the samples
# for validation; verbose=2 prints one summary line per epoch.
# NOTE(review): the batch size is set to maxlen (the longest trace length) -
# confirm this coupling is intentional.
model.fit(
    X,
    {'act_output': y_a, 'time_output': y_t},
    batch_size=maxlen,
    epochs=500,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint, lr_reducer],
    verbose=2,
)

Epoch 1/500
1370/1370 - 45s - loss: 1.5789 - act_output_loss: 0.6946 - time_output_loss: 0.8843 - val_loss: 2.1651 - val_act_output_loss: 1.2603 - val_time_output_loss: 0.9049 - lr: 0.0020 - 45s/epoch - 33ms/step
Epoch 2/500
1370/1370 - 37s - loss: 1.3383 - act_output_loss: 0.5451 - time_output_loss: 0.7932 - val_loss: 2.0873 - val_act_output_loss: 1.1970 - val_time_output_loss: 0.8903 - lr: 0.0020 - 37s/epoch - 27ms/step
Epoch 3/500
1370/1370 - 40s - loss: 1.3239 - act_output_loss: 0.5294 - time_output_loss: 0.7945 - val_loss: 2.2731 - val_act_output_loss: 1.3433 - val_time_output_loss: 0.9297 - lr: 0.0020 - 40s/epoch - 29ms/step
Epoch 4/500
1370/1370 - 38s - loss: 1.2987 - act_output_loss: 0.5212 - time_output_loss: 0.7775 - val_loss: 2.2928 - val_act_output_loss: 1.3910 - val_time_output_loss: 0.9019 - lr: 0.0020 - 38s/epoch - 28ms/step
Epoch 5/500
1370/1370 - 39s - loss: 1.2901 - act_output_loss: 0.5193 - time_output_loss: 0.7708 - val_loss: 2.2558 - val_act_output_loss: 1.4229 - v

1370/1370 - 41s - loss: 1.1755 - act_output_loss: 0.4692 - time_output_loss: 0.7063 - val_loss: 2.7810 - val_act_output_loss: 1.8537 - val_time_output_loss: 0.9273 - lr: 2.5000e-04 - 41s/epoch - 30ms/step
Epoch 40/500
1370/1370 - 41s - loss: 1.1797 - act_output_loss: 0.4740 - time_output_loss: 0.7057 - val_loss: 2.7972 - val_act_output_loss: 1.8611 - val_time_output_loss: 0.9361 - lr: 2.5000e-04 - 41s/epoch - 30ms/step
Epoch 41/500
1370/1370 - 40s - loss: 1.1687 - act_output_loss: 0.4681 - time_output_loss: 0.7005 - val_loss: 2.7208 - val_act_output_loss: 1.8074 - val_time_output_loss: 0.9133 - lr: 2.5000e-04 - 40s/epoch - 29ms/step
Epoch 42/500
1370/1370 - 41s - loss: 1.1700 - act_output_loss: 0.4724 - time_output_loss: 0.6976 - val_loss: 2.7417 - val_act_output_loss: 1.8302 - val_time_output_loss: 0.9114 - lr: 2.5000e-04 - 41s/epoch - 30ms/step
Epoch 43/500
1370/1370 - 38s - loss: 1.1732 - act_output_loss: 0.4689 - time_output_loss: 0.7043 - val_loss: 2.7291 - val_act_output_loss: 1.

<keras.callbacks.History at 0x20b856032b0>