In [32]:
from __future__ import print_function, division
import pandas as pd
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN, Dense, Input, BatchNormalization
from tensorflow.keras.utils import get_file
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from collections import Counter
import unicodecsv
import numpy as np
import random
import sys
import os
import copy
import csv
from math import log
from sklearn import preprocessing
pd.set_option('display.max_rows', None)

#read in original data
df_orig = pd.read_csv('data/event_data_alexander.parquet')

In [10]:
#rename the column names
df_orig.rename(columns={'concept:name': 'ActivityID', 'batch_id': 'CaseID', 'timestamp': 'CompleteTimestamp'}, inplace=True)

#only keep the relevant columns
df_orig = df_orig[['CaseID','ActivityID','CompleteTimestamp']]

#labelencode the activity labels
le_act = preprocessing.LabelEncoder()
df_orig['ActivityID'] = le_act.fit_transform(df_orig['ActivityID'])+1

#labelencode the case IDs
le_case = preprocessing.LabelEncoder()
df_orig['CaseID'] = le_case.fit_transform(df_orig['CaseID'])+1

#remove the milliseconds
df_orig['CompleteTimestamp'] = df_orig['CompleteTimestamp'].str[:-4]

#take a sample of 5 cases
#df_orig = df_orig[df_orig['CaseID'].isin([1,2,3,4,5])]
df_orig['CaseID'].nunique()

#sort values by CaseID
df_orig = df_orig.sort_values(['CaseID','CompleteTimestamp'])


#save and load again
df_orig.to_csv('data/event_data_yannis.csv', index=False)
df_new = pd.read_csv('data/event_data_alexander_rounded.csv')
print('the shape of the data',df_new.shape)

the shape of the data (10842, 3)


In [7]:
le_act.classes_

array(['1Pass_Prewet', 'Adjustment Charging', 'Agitation',
       'Agitator start', 'Agitator stop', 'Bottling', 'Charging',
       'Circulation', 'Circulation Prewet', 'Filters',
       'Line Loss Collection', 'Postwash', 'Pump adjustment',
       'Pump start', 'Pump stop'], dtype=object)

In [2]:
print(df_new['CaseID'].nunique())

174


In [4]:
df_new.head()

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp
0,1,10,2020-10-14 13:00:57
1,1,1,2020-10-14 13:16:00
2,1,10,2020-10-14 13:38:51
3,1,1,2020-10-14 16:43:32
4,1,9,2020-10-14 17:59:29


In [5]:
set(df_orig['ActivityID'])

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}

In [26]:
from datetime import datetime
import time

#define the maximum length of events
def find_max_list(lst):
    list_len = [len(i) for i in lst]
    return max(list_len)


########################################################################################
#this basically creates lists of lists with all the relevant information
########################################################################################

eventlog = 'event_data_alexander_rounded.csv'
csvfile = open('data/%s' % eventlog, 'r')
datareader = csv.reader(csvfile, delimiter=',', quotechar='|')

#helper variables
lastcase = ''
firstLine = True
lineseq = []
timeseqs = []
timeseqs2 = []
timeseqs3 = []
timeseqs4 = []
lines = []
times = []
times2 = []
times3 = []
times4 = []
numlines = 0
casestarttime = None
lasteventtime = None

next(datareader, None)  # skip the headers
for row in datareader: #the columns are "CaseID,ActivityID,CompleteTimestamp"
    t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S") #creates a datetime object from row[2]
    if row[0]!=lastcase:  #'lastcase' is to save the last executed case for the loop
        casestarttime = t
        lasteventtime = t
        lastcase = row[0]
        if not firstLine:
            lineseq.append(lines)
            timeseqs.append(times)
            timeseqs2.append(times2)
            timeseqs3.append(times3)
            timeseqs4.append(times4)
        lines = []
        times = []
        times2 = []
        times3 = []
        times4 = []
        numlines+=1
    timesincelastevent = datetime.fromtimestamp(time.mktime(t))-datetime.fromtimestamp(time.mktime(lasteventtime))
    timesincecasestart = datetime.fromtimestamp(time.mktime(t))-datetime.fromtimestamp(time.mktime(casestarttime))
    midnight = datetime.fromtimestamp(time.mktime(t)).replace(hour=0, minute=0, second=0, microsecond=0)
    timesincemidnight = datetime.fromtimestamp(time.mktime(t))-midnight
    timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds     #multiply with 60*60*24 = 86400 to go from days to seconds
    timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds    #the .seconds method gives the time in seconds
    timediff3 = timesincemidnight.seconds #this leaves only time event occured after midnight
    timediff4 = datetime.fromtimestamp(time.mktime(t)).weekday() #day of the week
    lines.append(str(row[1])) #add the activity label to the line list
    times.append(timediff)
    times2.append(timediff2)
    times3.append(timediff3)
    times4.append(timediff4)
    lasteventtime = t
    firstLine = False

# add last case
lineseq.append(lines)
timeseqs.append(times)
timeseqs2.append(times2)
timeseqs3.append(times3)
timeseqs4.append(times4)
numlines+=1

divisor = np.mean([item for sublist in timeseqs for item in sublist]) #average time between events
print('divisor: {}'.format(divisor))
divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist]) #average time between current and first events
print('divisor2: {}'.format(divisor2))

#########################################################################################################
# separate data into 3 parts
elems_per_fold = int(round(numlines/3))
fold1 = lineseq[:elems_per_fold]
fold1_t = timeseqs[:elems_per_fold]
fold1_t2 = timeseqs2[:elems_per_fold]
fold1_t3 = timeseqs3[:elems_per_fold]
fold1_t4 = timeseqs4[:elems_per_fold]
import csv
with open('code/output_files/folds/fold1.csv', 'w', newline='') as csvfile:
    datawriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for row, timeseq in zip(fold1, fold1_t):
        datawriter.writerow([f"{s}#{t}" for s, t in zip(row, timeseq)])


fold2 = lineseq[elems_per_fold:2*elems_per_fold]
fold2_t = timeseqs[elems_per_fold:2*elems_per_fold]
fold2_t2 = timeseqs2[elems_per_fold:2*elems_per_fold]
fold2_t3 = timeseqs3[elems_per_fold:2*elems_per_fold]
fold2_t4 = timeseqs4[elems_per_fold:2*elems_per_fold]
with open('code/output_files/folds/fold2.csv', 'w', newline='') as csvfile:
    datawriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for row, timeseq in zip(fold2, fold2_t):
        datawriter.writerow([f"{s}#{t}" for s, t in zip(row, timeseq)])

fold3 = lineseq[2*elems_per_fold:]
fold3_t = timeseqs[2*elems_per_fold:]
fold3_t2 = timeseqs2[2*elems_per_fold:]
fold3_t3 = timeseqs3[2*elems_per_fold:]
fold3_t4 = timeseqs4[2*elems_per_fold:]
with open('code/output_files/folds/fold3.csv','w', newline='') as csvfile:
    datawriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for row, timeseq in zip(fold3, fold3_t):
        datawriter.writerow([f"{s}#{t}" for s, t in zip(row, timeseq)])

divisor: 8329.618428334255
divisor2: 257467.7155506364


In [27]:
lines = fold1 + fold2
lines_t = fold1_t + fold2_t
lines_t2 = fold1_t2 + fold2_t2
lines_t3 = fold1_t3 + fold2_t3
lines_t4 = fold1_t4 + fold2_t4

step = 1
sentences = []
softness = 0
next_chars = []

In [37]:
#helper variables
step = 1
sentences = []
softness = 0
next_chars = []
sentences_t = []
sentences_t2 = []
sentences_t3 = []
sentences_t4 = []
next_chars_t = []
next_chars_t2 = []
next_chars_t3 = []
next_chars_t4 = []
maxlen = find_max_list(lines) #find maximum line size

for line, line_t, line_t2, line_t3, line_t4 in zip(lines, lines_t, lines_t2, lines_t3, lines_t4):
    for i in range(0, len(line), step):
        if i==0:
            continue
        #we take all the prefix traces of the traces
        #we add iteratively, first symbol of the line, then two first, three...
        sentences.append(line[0:i])
        sentences_t.append(line_t[0:i])
        sentences_t2.append(line_t2[0:i])
        sentences_t3.append(line_t3[0:i])
        sentences_t4.append(line_t4[0:i])
        next_chars.append(line[i])
        if i==len(line)-1: # special case to deal time of end character
            next_chars_t.append(0)
            next_chars_t2.append(0)
            next_chars_t3.append(0)
            next_chars_t4.append(0)
        else:
            next_chars_t.append(line_t[i])
            next_chars_t2.append(line_t2[i])
            next_chars_t3.append(line_t3[i])
            next_chars_t4.append(line_t4[i])
activities = set([item for sublist in lineseq for item in sublist])
target_activities = copy.copy(activities)
print('there are now:', len(sentences), 'prefix sequences')
print('there are:', len(activities), 'different activities')

there are now: 6509 prefix sequences
there are: 15 different activities


In [29]:
print('total activities: {}, target activities: {}'.format(len(activities), len(target_activities)))
act_indices = dict((c, i) for i, c in enumerate(activities))
indices_act = dict((i, c) for i, c in enumerate(activities))
target_activities_indices = dict((c, i) for i, c in enumerate(target_activities))
target_indices_activities = dict((i, c) for i, c in enumerate(target_activities))
print(indices_act)

print('Vectorization...')
num_features = len(activities)+5
print('num features: {}'.format(num_features))
X = np.zeros((len(sentences), maxlen, num_features), dtype=np.float32)
y_a = np.zeros((len(sentences), len(target_activities)), dtype=np.float32)
y_t = np.zeros((len(sentences)), dtype=np.float32)

total activities: 15, target activities: 15
{0: '13', 1: '3', 2: '7', 3: '4', 4: '5', 5: '12', 6: '10', 7: '14', 8: '6', 9: '11', 10: '1', 11: '15', 12: '9', 13: '8', 14: '2'}
Vectorization...
num features: 20


In [30]:
for i, sentence in enumerate(sentences):
    leftpad = maxlen-len(sentence)
    next_t = next_chars_t[i]
    sentence_t = sentences_t[i]
    sentence_t2 = sentences_t2[i]
    sentence_t3 = sentences_t3[i]
    sentence_t4 = sentences_t4[i]
    for t, char in enumerate(sentence):
        multiset_abstraction = Counter(sentence[:t+1])
        for c in activities:
            if c==char: #this will encode present events to the right places
                X[i, t+leftpad, act_indices[c]] = 1
        X[i, t+leftpad, len(activities)] = t+1
        X[i, t+leftpad, len(activities)+1] = sentence_t[t]/divisor
        X[i, t+leftpad, len(activities)+2] = sentence_t2[t]/divisor2
        X[i, t+leftpad, len(activities)+3] = sentence_t3[t]/86400
        X[i, t+leftpad, len(activities)+4] = sentence_t4[t]/7
    for c in target_activities:
        if c==next_chars[i]:
            y_a[i, target_activities_indices[c]] = 1-softness
        else:
            y_a[i, target_activities_indices[c]] = softness/(len(target_activities)-1)
    y_t[i] = next_t/divisor

In [35]:
# build the model: 
print('Build model...')
main_input = Input(shape=(maxlen, num_features), name='main_input')
# train a 2-layer LSTM with one shared layer
l1 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=True, dropout=0.2)(main_input) # the shared layer
b1 = BatchNormalization()(l1)
l2_1 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, dropout=0.2)(b1) # the layer specialized in activity prediction
b2_1 = BatchNormalization()(l2_1)
l2_2 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, dropout=0.2)(b1) # the layer specialized in time prediction
b2_2 = BatchNormalization()(l2_2)
act_output = Dense(len(target_activities), activation='softmax', kernel_initializer='glorot_uniform', name='act_output')(b2_1)
time_output = Dense(1, kernel_initializer='glorot_uniform', name='time_output')(b2_2)

model = Model(inputs=[main_input], outputs=[act_output, time_output])

opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, clipvalue=3)

model.compile(loss={'act_output':'categorical_crossentropy', 'time_output':'mae'}, optimizer=opt)
early_stopping = EarlyStopping(monitor='val_loss', patience=42)
model_checkpoint = ModelCheckpoint('output_files/models/model_{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto')
lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)

model.fit(X, {'act_output':y_a, 'time_output':y_t}, validation_split=0.2, verbose=2, callbacks=[early_stopping, model_checkpoint, lr_reducer], batch_size=maxlen, epochs=100)

Build model...
Epoch 1/100
44/44 - 43s - loss: 3.4811 - act_output_loss: 2.1214 - time_output_loss: 1.3597 - val_loss: 3.0428 - val_act_output_loss: 1.9265 - val_time_output_loss: 1.1164 - lr: 0.0020 - 43s/epoch - 974ms/step
Epoch 2/100
44/44 - 39s - loss: 2.7725 - act_output_loss: 1.6469 - time_output_loss: 1.1257 - val_loss: 2.8837 - val_act_output_loss: 1.7658 - val_time_output_loss: 1.1179 - lr: 0.0020 - 39s/epoch - 893ms/step
Epoch 3/100
44/44 - 35s - loss: 2.5864 - act_output_loss: 1.4963 - time_output_loss: 1.0901 - val_loss: 2.7710 - val_act_output_loss: 1.7175 - val_time_output_loss: 1.0535 - lr: 0.0020 - 35s/epoch - 805ms/step
Epoch 4/100
44/44 - 37s - loss: 2.4216 - act_output_loss: 1.3498 - time_output_loss: 1.0718 - val_loss: 2.6822 - val_act_output_loss: 1.6190 - val_time_output_loss: 1.0632 - lr: 0.0020 - 37s/epoch - 839ms/step
Epoch 5/100
44/44 - 43s - loss: 2.2748 - act_output_loss: 1.2116 - time_output_loss: 1.0631 - val_loss: 2.6698 - val_act_output_loss: 1.5921 - va

<keras.src.callbacks.History at 0x192451d1010>

In [34]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 main_input (InputLayer)     [(None, 119, 20)]            0         []                            
                                                                                                  
 lstm_6 (LSTM)               (None, 119, 100)             48400     ['main_input[0][0]']          
                                                                                                  
 batch_normalization_6 (Bat  (None, 119, 100)             400       ['lstm_6[0][0]']              
 chNormalization)                                                                                 
                                                                                                  
 lstm_7 (LSTM)               (None, 100)                  80400     ['batch_normalization_6[

In [49]:
print('ok')

ok


In [None]:
df['Pump Circulation Flow']

In [None]:
print('number of activities', df['concept:name'].nunique())
df['concept:name'].unique()

In [None]:
df['batch_id'] = df.index.get_level_values(0)

In [None]:
print('there are',df['batch_id'].nunique(), 'case IDs')