Commit
add file: the cycle suppression feature added to original predictions
yesanton committed Feb 7, 2017
1 parent 4e67dcc commit 050fb79
Showing 3 changed files with 278 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/calculate_accuracy_on_next_event.py
@@ -23,7 +23,7 @@ def output(eventlogs, number_logs = 3):
     averageTraceLengthsGroundTruth = 0
     mark = False
 
-    for i in range(number_logs):
+    for i in range(number_logs+1):
         csvfile = open('output_files/results/suffix_and_remaining_time' + str(i) + '_%s' % eventlog, 'r')
         r = unicodecsv.reader(csvfile, encoding='utf-8')
         r.next()  # header
@@ -0,0 +1,9 @@
'''
this file is built based on the code found in evaluate_suffix_and_remaining_time.py
here beam search (with backtracking) is implemented to find compliant predictions
also the cycle suppression functionality is added
Author: Anton Yeshchenko
'''
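A minimal sketch of the idea this docstring describes (not this repository's implementation: predict_next, k and max_length are hypothetical helpers; verify_formula_as_compliant is the project's compliance checker). At each step the k most likely continuations are explored best-first, and a finished suffix is accepted only if it is compliant; otherwise the search backtracks to the next-best candidate:

def beam_search_compliant(prefix, predict_next, k=3, max_length=50):
    # depth-first search over the k most likely continuations at each step
    stack = [prefix]
    while stack:
        candidate = stack.pop()
        if candidate.endswith('!'):  # '!' marks the end of a case
            if verify_formula_as_compliant(candidate):
                return candidate  # first compliant completion found
            continue  # not compliant: backtrack to the next-best candidate
        if len(candidate) >= max_length:
            continue  # give up on overly long candidates
        # predict_next returns the k most likely next symbols, best first;
        # push them in reverse so the best one is popped first
        for symbol in reversed(predict_next(candidate, k)):
            stack.append(candidate + symbol)
    return None  # no compliant suffix found within the search bounds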
268 changes: 268 additions & 0 deletions src/cycle_optimization/cycles_evaluate_suffix.py
@@ -0,0 +1,268 @@
'''
EVALUATES the trained model with techniques for diminishing cycle repetitions
Author: Anton Yeshchenko
'''

from __future__ import division

import re

from keras.models import load_model
import csv
import copy
import numpy as np
import distance
from itertools import izip
from jellyfish._jellyfish import damerau_levenshtein_distance
import unicodecsv
from sklearn import metrics
from math import sqrt
import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from collections import Counter

from src.formula_verificator import verify_formula_as_compliant
from src.shared_variables import eventlog, getUnicode_fromInt, path_to_model_file

start_time = time.time()

csvfile = open('../../data/%s' % eventlog, 'r')
spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')

next(spamreader, None) # skip the headers


lastcase = ''
line = ''
firstLine = True
lines = []
timeseqs = [] # relative time since previous event
timeseqs2 = [] # relative time since case start
timeseqs3 = [] # absolute time of previous event
times = []
times2 = []
times3 = []
numlines = 0
casestarttime = None
lasteventtime = None

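# expected CSV columns: case id (row[0]), activity id (row[1]), completion timestamp (row[2])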
for row in spamreader:
    t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
    if row[0]!=lastcase:
        casestarttime = t
        lasteventtime = t
        lastcase = row[0]
        if not firstLine:
            lines.append(line)
            timeseqs.append(times)
            timeseqs2.append(times2)
            timeseqs3.append(times3)
        line = ''
        times = []
        times2 = []
        times3 = []
        numlines+=1
    line+= getUnicode_fromInt(row[1])
    timesincelastevent = datetime.fromtimestamp(time.mktime(t))-datetime.fromtimestamp(time.mktime(lasteventtime))
    timesincecasestart = datetime.fromtimestamp(time.mktime(t))-datetime.fromtimestamp(time.mktime(casestarttime))
    midnight = datetime.fromtimestamp(time.mktime(t)).replace(hour=0, minute=0, second=0, microsecond=0)
    timesincemidnight = datetime.fromtimestamp(time.mktime(t))-midnight
    timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
    timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
    times.append(timediff)
    times2.append(timediff2)
    times3.append(datetime.fromtimestamp(time.mktime(t)))
    lasteventtime = t
    firstLine = False

# add last case
lines.append(line)
timeseqs.append(times)
timeseqs2.append(times2)
timeseqs3.append(times3)
numlines+=1

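# normalization constants for the time features:
# divisor  - mean time between consecutive events
# divisor2 - mean time since case start
# divisor3 - mean remaining cycle time (used later to rescale predicted durations)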
divisor = np.mean([item for sublist in timeseqs for item in sublist])
print('divisor: {}'.format(divisor))
divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
print('divisor2: {}'.format(divisor2))
divisor3 = np.mean(map(lambda x: np.mean(map(lambda y: x[len(x)-1]-y, x)), timeseqs2))
print('divisor3: {}'.format(divisor3))

elems_per_fold = int(round(numlines/3))

fold1and2lines = lines[:2*elems_per_fold]

step = 1
sentences = []
softness = 0
next_chars = []
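# '!' is appended to every trace as an explicit end-of-case symbol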
fold1and2lines = map(lambda x: x+'!',fold1and2lines)
maxlen = max(map(lambda x: len(x),fold1and2lines))

chars = map(lambda x : set(x),fold1and2lines)
chars = list(set().union(*chars))
chars.sort()
target_chars = copy.copy(chars)
chars.remove('!')
print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
target_char_indices = dict((c, i) for i, c in enumerate(target_chars))
target_indices_char = dict((i, c) for i, c in enumerate(target_chars))
print(indices_char)

# we only need the third fold, because the first two were used for training

fold3 = lines[2*elems_per_fold:]
fold3_t = timeseqs[2*elems_per_fold:]
fold3_t2 = timeseqs2[2*elems_per_fold:]
fold3_t3 = timeseqs3[2*elems_per_fold:]

lines = fold3
lines_t = fold3_t
lines_t2 = fold3_t2
lines_t3 = fold3_t3

# set parameters
predict_size = maxlen
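# i.e. predict at most as many events as the longest trace in the training folds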

# load model, set this to the model generated by train.py
model = load_model(path_to_model_file)

# define helper functions

# this one encodes the current sentence into the one-hot encoding
def encode(sentence, times, times3, maxlen=maxlen):
    num_features = len(chars)+5
    X = np.zeros((1, maxlen, num_features), dtype=np.float32)
    leftpad = maxlen-len(sentence)  # pad on the left so the last event always sits at the end
    times2 = np.cumsum(times)
    for t, char in enumerate(sentence):
        midnight = times3[t].replace(hour=0, minute=0, second=0, microsecond=0)
        timesincemidnight = times3[t]-midnight
        multiset_abstraction = Counter(sentence[:t+1])  # computed but not used in the feature vector
        for c in chars:
            if c==char:
                X[0, t+leftpad, char_indices[c]] = 1  # one-hot activity encoding
        X[0, t+leftpad, len(chars)] = t+1                                # position in the trace
        X[0, t+leftpad, len(chars)+1] = times[t]/divisor                 # time since previous event
        X[0, t+leftpad, len(chars)+2] = times2[t]/divisor2               # time since case start
        X[0, t+leftpad, len(chars)+3] = timesincemidnight.seconds/86400  # time of day
        X[0, t+leftpad, len(chars)+4] = times3[t].weekday()/7            # day of week
    return X
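# e.g. encode(cropped_line, cropped_times, cropped_times3) below yields a
# (1, maxlen, len(chars)+5) array ready for model.predict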



# the functionality to find cycles and modify the stop-symbol probability starts here
stop_symbol_probability_amplifier_current = 1

regex = re.compile(r'(.+.+)(\1)+')  # a unit of at least two characters, immediately repeated
match = regex.search('3 0 5 5 1 5 1 6 8')  # quick sanity check of the pattern
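# e.g. the test string above matches at ' 5 5': group(1) is the repeating
# unit ' 5' and group(0) is the whole repetition ' 5 5'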


# modified to be able to get the second best prediction
def getSymbol(predictions, ith_best = 0):
    # predictions[0] corresponds to the first symbol in the sorted target alphabet
    # (the end-of-case '!' here); amplifying it makes stopping more likely once a cycle is detected
    predictions[0] = predictions[0] * stop_symbol_probability_amplifier_current
    i = np.argsort(predictions)[len(predictions) - ith_best - 1]
    return target_indices_char[i]
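# e.g. getSymbol(y_char, ith_best=1) returns the second most likely next activity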

one_ahead_gt = []
one_ahead_pred = []

two_ahead_gt = []
two_ahead_pred = []

three_ahead_gt = []
three_ahead_pred = []
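# only the one-ahead lists are filled in this script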

# select only lines with formula verified
lines_v = []
lines_t_v = []
lines_t2_v = []
lines_t3_v = []
for line, times, times2, times3 in izip(lines, lines_t, lines_t2, lines_t3):
    if verify_formula_as_compliant(line):
        lines_v.append(line)
        lines_t_v.append(times)
        lines_t2_v.append(times2)
        lines_t3_v.append(times3)

lines = lines_v
lines_t = lines_t_v
lines_t2 = lines_t2_v
lines_t3 = lines_t3_v

with open('../output_files/results/suffix_and_remaining_time3_%s' % eventlog, 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(["Prefix length", "Ground truth", "Predicted", "Levenshtein", "Damerau", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE", "Median AE"])
    for prefix_size in range(10,11):  # evaluate prefixes of length 10 only
        print(prefix_size)
        for line, times, times2, times3 in izip(lines, lines_t, lines_t2, lines_t3):
            times.append(0)
            cropped_line = ''.join(line[:prefix_size])
            cropped_times = times[:prefix_size]
            cropped_times3 = times3[:prefix_size]
            if len(times2)<prefix_size:
                continue  # make no prediction for this case, since this case has ended already
            ground_truth = ''.join(line[prefix_size:prefix_size+predict_size])
            ground_truth_t = times2[prefix_size-1]
            case_end_time = times2[len(times2)-1]
            ground_truth_t = case_end_time-ground_truth_t  # remaining time after the prefix
            predicted = ''
            total_predicted_time = 0
            for i in range(predict_size):
                enc = encode(cropped_line, cropped_times, cropped_times3)
                y = model.predict(enc, verbose=0)  # make predictions
                # split predictions into separate activity and time predictions
                y_char = y[0][0]
                y_t = y[1][0][0]
                prediction = getSymbol(y_char)  # undo one-hot encoding
                cropped_line += prediction

                # note: flags cannot be passed to a compiled pattern's search();
                # a second argument would be interpreted as a start position
                match = regex.search(cropped_line)
                # match.group(0) is the whole substring that contains 1+ cycles
                # match.group(1) is the substring that indicates the cycle
                if match is not None:
                    # the more often the cycle repeats, the stronger the stop symbol is
                    # amplified: e.g. 'abab' gives exp(4/2), 'ababab' gives exp(6/2)
                    stop_symbol_probability_amplifier_current = np.math.exp(len(match.group(0)) / len(match.group(1)))

                if y_t<0:
                    y_t=0
                cropped_times.append(y_t)
                if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                    one_ahead_pred.append(total_predicted_time)
                    one_ahead_gt.append(ground_truth_t)
                    print('! predicted, end case')
                    break
                y_t = y_t * divisor3  # rescale the normalized time prediction back to seconds
                cropped_times3.append(cropped_times3[-1] + timedelta(seconds=y_t))
                total_predicted_time = total_predicted_time + y_t
                predicted += prediction
            output = []
            if len(ground_truth)>0:
                output.append(prefix_size)
                output.append(unicode(ground_truth).encode("utf-8"))
                output.append(unicode(predicted).encode("utf-8"))
                output.append(1 - distance.nlevenshtein(predicted, ground_truth))
                dls = 1 - (damerau_levenshtein_distance(unicode(predicted), unicode(ground_truth)) / max(len(predicted),len(ground_truth)))
                if dls<0:
                    dls=0  # on some Linux machines the OS's default character encoding made the Damerau-Levenshtein similarity negative; clamp it, as this should never be the case
                output.append(dls)
                output.append(1 - distance.jaccard(predicted, ground_truth))
                output.append(ground_truth_t)
                output.append(total_predicted_time)
                output.append('')  # RMSE column is left empty
                output.append(metrics.mean_absolute_error([ground_truth_t], [total_predicted_time]))
                output.append(metrics.median_absolute_error([ground_truth_t], [total_predicted_time]))
                spamwriter.writerow(output)
print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
