Commit
add file: the cycle suppression feature added to original predictions
yesanton committed Feb 7, 2017
1 parent 4e67dcc commit 050fb79
Showing 3 changed files with 278 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/calculate_accuracy_on_next_event.py
@@ -23,7 +23,7 @@ def output(eventlogs, number_logs = 3):
     averageTraceLengthsGroundTruth = 0
     mark = False
 
-    for i in range(number_logs):
+    for i in range(number_logs+1):
         csvfile = open('output_files/results/suffix_and_remaining_time' + str(i) + '_%s' % eventlog, 'r')
         r = unicodecsv.reader(csvfile, encoding='utf-8')
         r.next()  # header
@@ -0,0 +1,9 @@
'''
this file is built based on the code found in evaluate_suffix_and_remaining_time.py
here beam search (with backtracking) is implemented to find compliant predictions
also the cycle suppression functionality is added
Author: Anton Yeshchenko
'''
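A minimal sketch of the idea this docstring describes (not this repository's implementation: predict_next, k and max_length are hypothetical helpers; verify_formula_as_compliant is the project's compliance checker). At each step the k most likely continuations are explored best-first, and a finished suffix is accepted only if it is compliant; otherwise the search backtracks to the next-best candidate:

def beam_search_compliant(prefix, predict_next, k=3, max_length=50):
    # depth-first search over the k most likely continuations at each step
    stack = [prefix]
    while stack:
        candidate = stack.pop()
        if candidate.endswith('!'):  # '!' marks the end of a case
            if verify_formula_as_compliant(candidate):
                return candidate  # first compliant completion found
            continue  # not compliant: backtrack to the next-best candidate
        if len(candidate) >= max_length:
            continue  # give up on overly long candidates
        # predict_next returns the k most likely next symbols, best first;
        # push them in reverse so the best one is popped first
        for symbol in reversed(predict_next(candidate, k)):
            stack.append(candidate + symbol)
    return None  # no compliant suffix found within the search bounds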
268 changes: 268 additions & 0 deletions src/cycle_optimization/cycles_evaluate_suffix.py
@@ -0,0 +1,268 @@
'''
EVALUATES the trained model with techniques for diminishing cycle repetitions
Author: Anton Yeshchenko
'''

from __future__ import division

import re

from keras.models import load_model
import csv
import copy
import numpy as np
import distance
from itertools import izip
from jellyfish._jellyfish import damerau_levenshtein_distance
import unicodecsv
from sklearn import metrics
from math import sqrt
import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from collections import Counter

from src.formula_verificator import verify_formula_as_compliant
from src.shared_variables import eventlog, getUnicode_fromInt, path_to_model_file

start_time = time.time()

csvfile = open('../../data/%s' % eventlog, 'r')
spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')

next(spamreader, None) # skip the headers


lastcase = ''
line = ''
firstLine = True
lines = []
timeseqs = [] # relative time since previous event
timeseqs2 = [] # relative time since case start
timeseqs3 = [] # absolute time of previous event
times = []
times2 = []
times3 = []
numlines = 0
casestarttime = None
lasteventtime = None

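# expected CSV columns: case id (row[0]), activity id (row[1]), completion timestamp (row[2])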
for row in spamreader:
    t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
    if row[0]!=lastcase:
        casestarttime = t
        lasteventtime = t
        lastcase = row[0]
        if not firstLine:
            lines.append(line)
            timeseqs.append(times)
            timeseqs2.append(times2)
            timeseqs3.append(times3)
        line = ''
        times = []
        times2 = []
        times3 = []
        numlines+=1
    line+= getUnicode_fromInt(row[1])
    timesincelastevent = datetime.fromtimestamp(time.mktime(t))-datetime.fromtimestamp(time.mktime(lasteventtime))
    timesincecasestart = datetime.fromtimestamp(time.mktime(t))-datetime.fromtimestamp(time.mktime(casestarttime))
    midnight = datetime.fromtimestamp(time.mktime(t)).replace(hour=0, minute=0, second=0, microsecond=0)
    timesincemidnight = datetime.fromtimestamp(time.mktime(t))-midnight
    timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
    timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
    times.append(timediff)
    times2.append(timediff2)
    times3.append(datetime.fromtimestamp(time.mktime(t)))
    lasteventtime = t
    firstLine = False

# add last case
lines.append(line)
timeseqs.append(times)
timeseqs2.append(times2)
timeseqs3.append(times3)
numlines+=1

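# normalization constants for the time features:
# divisor  - mean time between consecutive events
# divisor2 - mean time since case start
# divisor3 - mean remaining cycle time (used later to rescale predicted durations)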
divisor = np.mean([item for sublist in timeseqs for item in sublist])
print('divisor: {}'.format(divisor))
divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
print('divisor2: {}'.format(divisor2))
divisor3 = np.mean(map(lambda x: np.mean(map(lambda y: x[len(x)-1]-y, x)), timeseqs2))
print('divisor3: {}'.format(divisor3))

elems_per_fold = int(round(numlines/3))

fold1and2lines = lines[:2*elems_per_fold]

step = 1
sentences = []
softness = 0
next_chars = []
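# '!' is appended to every trace as an explicit end-of-case symbol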
fold1and2lines = map(lambda x: x+'!',fold1and2lines)
maxlen = max(map(lambda x: len(x),fold1and2lines))

chars = map(lambda x : set(x),fold1and2lines)
chars = list(set().union(*chars))
chars.sort()
target_chars = copy.copy(chars)
chars.remove('!')
print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
target_char_indices = dict((c, i) for i, c in enumerate(target_chars))
target_indices_char = dict((i, c) for i, c in enumerate(target_chars))
print(indices_char)

# we only need the third fold, because the first two were used for training

fold3 = lines[2*elems_per_fold:]
fold3_t = timeseqs[2*elems_per_fold:]
fold3_t2 = timeseqs2[2*elems_per_fold:]
fold3_t3 = timeseqs3[2*elems_per_fold:]

lines = fold3
lines_t = fold3_t
lines_t2 = fold3_t2
lines_t3 = fold3_t3

# set parameters
predict_size = maxlen
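# i.e. predict at most as many events as the longest trace in the training folds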

# load model, set this to the model generated by train.py
model = load_model(path_to_model_file)

# define helper functions

# this one encodes the current sentence into the one-hot encoding
def encode(sentence, times, times3, maxlen=maxlen):
    num_features = len(chars)+5
    X = np.zeros((1, maxlen, num_features), dtype=np.float32)
    leftpad = maxlen-len(sentence)  # pad on the left so the last event always sits at the end
    times2 = np.cumsum(times)
    for t, char in enumerate(sentence):
        midnight = times3[t].replace(hour=0, minute=0, second=0, microsecond=0)
        timesincemidnight = times3[t]-midnight
        multiset_abstraction = Counter(sentence[:t+1])  # computed but not used in the feature vector
        for c in chars:
            if c==char:
                X[0, t+leftpad, char_indices[c]] = 1  # one-hot activity encoding
        X[0, t+leftpad, len(chars)] = t+1                                # position in the trace
        X[0, t+leftpad, len(chars)+1] = times[t]/divisor                 # time since previous event
        X[0, t+leftpad, len(chars)+2] = times2[t]/divisor2               # time since case start
        X[0, t+leftpad, len(chars)+3] = timesincemidnight.seconds/86400  # time of day
        X[0, t+leftpad, len(chars)+4] = times3[t].weekday()/7            # day of week
    return X
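# e.g. encode(cropped_line, cropped_times, cropped_times3) below yields a
# (1, maxlen, len(chars)+5) array ready for model.predict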



# the functionality to find cycles and modify the stop-symbol probability starts here
stop_symbol_probability_amplifier_current = 1

regex = re.compile(r'(.+.+)(\1)+')  # a unit of at least two characters, immediately repeated
match = regex.search('3 0 5 5 1 5 1 6 8')  # quick sanity check of the pattern
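# e.g. the test string above matches at ' 5 5': group(1) is the repeating
# unit ' 5' and group(0) is the whole repetition ' 5 5'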


# modified to be able to get the second best prediction
def getSymbol(predictions, ith_best = 0):
    # predictions[0] corresponds to the first symbol in the sorted target alphabet
    # (the end-of-case '!' here); amplifying it makes stopping more likely once a cycle is detected
    predictions[0] = predictions[0] * stop_symbol_probability_amplifier_current
    i = np.argsort(predictions)[len(predictions) - ith_best - 1]
    return target_indices_char[i]
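# e.g. getSymbol(y_char, ith_best=1) returns the second most likely next activity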

one_ahead_gt = []
one_ahead_pred = []

two_ahead_gt = []
two_ahead_pred = []

three_ahead_gt = []
three_ahead_pred = []
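# only the one-ahead lists are filled in this script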

# select only lines with formula verified
lines_v = []
lines_t_v = []
lines_t2_v = []
lines_t3_v = []
for line, times, times2, times3 in izip(lines, lines_t, lines_t2, lines_t3):
    if verify_formula_as_compliant(line):
        lines_v.append(line)
        lines_t_v.append(times)
        lines_t2_v.append(times2)
        lines_t3_v.append(times3)

lines = lines_v
lines_t = lines_t_v
lines_t2 = lines_t2_v
lines_t3 = lines_t3_v

with open('../output_files/results/suffix_and_remaining_time3_%s' % eventlog, 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(["Prefix length", "Ground truth", "Predicted", "Levenshtein", "Damerau", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE", "Median AE"])
    for prefix_size in range(10,11):  # evaluate prefixes of length 10 only
        print(prefix_size)
        for line, times, times2, times3 in izip(lines, lines_t, lines_t2, lines_t3):
            times.append(0)
            cropped_line = ''.join(line[:prefix_size])
            cropped_times = times[:prefix_size]
            cropped_times3 = times3[:prefix_size]
            if len(times2)<prefix_size:
                continue  # make no prediction for this case, since this case has ended already
            ground_truth = ''.join(line[prefix_size:prefix_size+predict_size])
            ground_truth_t = times2[prefix_size-1]
            case_end_time = times2[len(times2)-1]
            ground_truth_t = case_end_time-ground_truth_t  # remaining time after the prefix
            predicted = ''
            total_predicted_time = 0
            for i in range(predict_size):
                enc = encode(cropped_line, cropped_times, cropped_times3)
                y = model.predict(enc, verbose=0)  # make predictions
                # split predictions into separate activity and time predictions
                y_char = y[0][0]
                y_t = y[1][0][0]
                prediction = getSymbol(y_char)  # undo one-hot encoding
                cropped_line += prediction

                # note: flags cannot be passed to a compiled pattern's search();
                # a second argument would be interpreted as a start position
                match = regex.search(cropped_line)
                # match.group(0) is the whole substring that contains 1+ cycles
                # match.group(1) is the substring that indicates the cycle
                if match is not None:
                    # the more often the cycle repeats, the stronger the stop symbol is
                    # amplified: e.g. 'abab' gives exp(4/2), 'ababab' gives exp(6/2)
                    stop_symbol_probability_amplifier_current = np.math.exp(len(match.group(0)) / len(match.group(1)))

                if y_t<0:
                    y_t=0
                cropped_times.append(y_t)
                if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                    one_ahead_pred.append(total_predicted_time)
                    one_ahead_gt.append(ground_truth_t)
                    print('! predicted, end case')
                    break
                y_t = y_t * divisor3  # rescale the normalized time prediction back to seconds
                cropped_times3.append(cropped_times3[-1] + timedelta(seconds=y_t))
                total_predicted_time = total_predicted_time + y_t
                predicted += prediction
            output = []
            if len(ground_truth)>0:
                output.append(prefix_size)
                output.append(unicode(ground_truth).encode("utf-8"))
                output.append(unicode(predicted).encode("utf-8"))
                output.append(1 - distance.nlevenshtein(predicted, ground_truth))
                dls = 1 - (damerau_levenshtein_distance(unicode(predicted), unicode(ground_truth)) / max(len(predicted),len(ground_truth)))
                if dls<0:
                    dls=0  # on some Linux machines the OS's default character encoding made the Damerau-Levenshtein similarity negative; clamp it, as this should never be the case
                output.append(dls)
                output.append(1 - distance.jaccard(predicted, ground_truth))
                output.append(ground_truth_t)
                output.append(total_predicted_time)
                output.append('')  # RMSE column is left empty
                output.append(metrics.mean_absolute_error([ground_truth_t], [total_predicted_time]))
                output.append(metrics.median_absolute_error([ground_truth_t], [total_predicted_time]))
                spamwriter.writerow(output)
print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
