In [1]:
## import all necessary libraries ##
import distance

from similarity.levenshtein import Levenshtein
levenshtein = Levenshtein()

from similarity.damerau import Damerau
damerau = Damerau()

from pyjarowinkler import distance as jwdistance
from similarity.jarowinkler import JaroWinkler
jarowinkler = JaroWinkler()

from similarity.weighted_levenshtein import WeightedLevenshtein
from similarity.weighted_levenshtein import CharacterSubstitutionInterface
import math
class CharacterSubstitution(CharacterSubstitutionInterface):
    def cost(self, c0, c1):
        return math.inf # assign inifte weight to all substitutions
levenshtein2 = WeightedLevenshtein(CharacterSubstitution())

#########################

import numpy as np
from datetime import datetime
import random 

#########################

from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.algo.filtering.log.variants import variants_filter


import os.path

In [22]:
## import dataset ##
file = "BPIC15"
log = xes_importer.apply("../Datasets/"+ file + ".xes") # adjust for local file location

parsing log, completed traces ::   0%|          | 0/1199 [00:00<?, ?it/s]

In [23]:
## generate mapping from activities to chars ##

# extract all activities from log and convert them into list
activities = list(attributes_filter.get_attribute_values(log, "concept:name").keys())

transl = {} # dictionary mapping the names of the activities to chars
for i, a in enumerate(activities):
    transl[a] = chr(i+1)

def list_to_string(trace):
    string = ""
    for e in trace:
        string = string + transl[e] 
    return string

In [24]:
## generate list of strings for all variants ##

variants = list(variants_filter.get_variants(log).keys())
strings = [] # list of strings for each variant
variant_to_index = {} # dicitionary to translate variant to index in list for later lookup of traces

# generate strings
for i, v in enumerate(variants):
    strings.append(list_to_string(v.split(",")))
    variant_to_index[v] = i

# translation function: trace to index
def trace_to_index(trace):
    # convert trace to string representation of variant (concept:name separated by commas)
    trace_string = ""
    for e in trace:
        trace_string = trace_string + e["concept:name"] + ","
    trace_string = trace_string[:-1] # remove last comma
    
    # lookup string representation of variant
    return variant_to_index[trace_string]
        
print("First 5 strings:", strings[:5])
print("No. of strings:", len(strings))

First 5 strings: ['\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-', '\x06\x07\x08\x02\x03\x04\x05\x01\n\t\x0c\x0b\r\x0e./\x19012345678\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18#$\x0e*)\x1d+ \x1b\x1e!\x1f%"\'\x1a\x1c-&(,', '\x01\x02\x03\x04\x05\x06\x07\x08\t\x0b\x0c\n\r\x0e./\x190124356\x0e8\x0f\x10\x11\x12\x13\x14\x15\x16\x177"\x1b\x1d\x1c\x1f\x1e !\x18\x1a#$\'%*+),-&(', '\x01\x02\x03\x04\x05\x06\x07\x08\n\x0c\x0b\t\r\x0e./\x19102476358\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x0e!\x1c\x1d\x1e \x1f"\x1b\x1a#$%\'+9)*(&,-', '\x01\x02\x03\x04\x05\x06\x08\t\x0b\n\x0c\r\x19\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x07! \x1a\x1b\x1d\x1c\x1f"\x1e#$*%\'9+)&-(,']
No. of strings: 1170


In [25]:
## optional padding when using Hamming distance ##

count = 0
for s in strings:
    if len(s) > count:
        count = len(s)

strings_pad = []

for s in strings:
    strings_pad.append(s + chr(0) * (count - len(s)))

#strings = strings_pad

print("First 5 strings:", strings_pad[:5])

First 5 strings: ['\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', '\x06\x07\x08\x02\x03\x04\x05\x01\n\t\x0c\x0b\r\x0e./\x19012345678\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18#$\x0e*)\x1d+ \x1b\x1e!\x1f%"\'\x1a\x1c-&(,\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', '\x01\x02\x03\x04\x05\x06\x07\x08\t\x0b\x0c\n\r\x0e./\x190124356\x0e8\x0f\x10\x11\x12\x13\x14\x15\x16\x177"\x1b\x1d\x1c\x1f\x1e !\x18\x1a#$\'%*+),-&(\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00

# Prefix-removal + String-split

In [15]:
## calculation of distance matrix ##

#strings = strings_pad # optional use of padded strings for Hamming
n = len(strings)
distMatrix = np.full((n, n), 0, dtype = np.uint8)

start = datetime.now()
print("Start:", start)

for i, x in enumerate(strings):
    for j, y in enumerate(strings):
        if j >= i: # only calculate upper right triangle of matrix
            # prefix removal
            prefix_len = len(os.path.commonprefix([x, y]))
            x_pref = x[prefix_len:]
            y_pref = y[prefix_len:]
            
            # string split
            x_pref_split = round(len(x_pref)/2)
            x_pref1 = x_pref[:x_pref_split]
            x_pref2 = x_pref[x_pref_split:]
            y_pref_split = round(len(y_pref)/2)
            y_pref1 = y_pref[:y_pref_split]
            y_pref2 = y_pref[y_pref_split:]
            
            #dist1 = distance.hamming(x_pref1, y_pref1) # Hamming distance
            #dist2 = distance.hamming(x_pref2, y_pref2)
            
            #dist1 = levenshtein.distance(x_pref1, y_pref1) # Levenshtein v2 (faster than v1)
            #dist2 = levenshtein.distance(x_pref2, y_pref2)
            
            #dist1 = levenshtein2.distance(x_pref1, y_pref1) # Levenshtein II
            #dist2 = levenshtein2.distance(x_pref2, y_pref2)

            dist1 = damerau.distance(x_pref1, y_pref1) # Damerau-Levenshtein
            dist2 = damerau.distance(x_pref2, y_pref2)
            
            #dist1 = jarowinkler.similarity(x1, y_pref1) * 255 / 2 # Jaro-Winkler v2 (faster than v1)
            #dist2 = jarowinkler.similarity(x_pref2, y_pref2) * 255 /2

            dist = dist1 + dist2

            distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()


print("Time needed:", end-start)

np.savetxt("../Results/PrefixSplit/PrefixSplit-" + file + "-Levenshtein-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + ".csv", distMatrix, fmt='%i', delimiter = ";")
print(distMatrix)

Start: 2021-01-30 20:56:42.357126
Time needed: 0:00:01.393797
[[0 2 1 ... 4 4 4]
 [2 0 1 ... 2 5 2]
 [1 1 0 ... 3 4 3]
 ...
 [4 2 3 ... 0 7 3]
 [4 5 4 ... 7 0 7]
 [4 2 3 ... 3 7 0]]


# Calculation for all methods

In [26]:
#strings = strings_pad # optional use of padded strings for Hamming
n = len(strings)

times = np.empty(5, dtype='object')

######################
### Hamming ###
######################
start = datetime.now()
distMatrix = np.full((n, n), 0, dtype = np.uint8)
print("Start:", start)

for i, x in enumerate(strings_pad):
    for j, y in enumerate(strings_pad):
        if j >= i: # only calculate upper right triangle of matrix
            # prefix removal
            prefix_len = len(os.path.commonprefix([x, y]))
            x_pref = x[prefix_len:]
            y_pref = y[prefix_len:]
            
            # string split
            x_pref_split = round(len(x_pref)/2)
            x_pref1 = x_pref[:x_pref_split]
            x_pref2 = x_pref[x_pref_split:]
            y_pref_split = round(len(y_pref)/2)
            y_pref1 = y_pref[:y_pref_split]
            y_pref2 = y_pref[y_pref_split:]
            
            dist1 = distance.hamming(x_pref1, y_pref1) # Hamming distance
            dist2 = distance.hamming(x_pref2, y_pref2)
            
            dist = dist1 + dist2

            distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()


print("Time needed:", end-start)

np.savetxt("../Results/PrefixSplit/PrefixSplit-" + file + "-Hamming-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + ".csv", distMatrix, fmt='%i', delimiter = ";")
print(distMatrix)
times[0] = str(end-start)

######################
### Levenshtein ###
######################

start = datetime.now()
distMatrix = np.full((n, n), 0, dtype = np.uint8)

print("Start:", start)

for i, x in enumerate(strings):
    for j, y in enumerate(strings):
        if j >= i: # only calculate upper right triangle of matrix
            # prefix removal
            prefix_len = len(os.path.commonprefix([x, y]))
            x_pref = x[prefix_len:]
            y_pref = y[prefix_len:]
            
            # string split
            x_pref_split = round(len(x_pref)/2)
            x_pref1 = x_pref[:x_pref_split]
            x_pref2 = x_pref[x_pref_split:]
            y_pref_split = round(len(y_pref)/2)
            y_pref1 = y_pref[:y_pref_split]
            y_pref2 = y_pref[y_pref_split:]
            
            dist1 = levenshtein.distance(x_pref1, y_pref1) # Levenshtein v2 (faster than v1)
            dist2 = levenshtein.distance(x_pref2, y_pref2)

            dist = dist1 + dist2

            distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()


print("Time needed:", end-start)

np.savetxt("../Results/PrefixSplit/PrefixSplit-" + file + "-Levenshtein-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + ".csv", distMatrix, fmt='%i', delimiter = ";")
print(distMatrix)
times[1] = str(end-start)

######################
### Levenshtein II ###
######################

start = datetime.now()
distMatrix = np.full((n, n), 0, dtype = np.uint8)
print("Start:", start)

for i, x in enumerate(strings):
    for j, y in enumerate(strings):
        if j >= i: # only calculate upper right triangle of matrix
             # prefix removal
            prefix_len = len(os.path.commonprefix([x, y]))
            x_pref = x[prefix_len:]
            y_pref = y[prefix_len:]
            
            # string split
            x_pref_split = round(len(x_pref)/2)
            x_pref1 = x_pref[:x_pref_split]
            x_pref2 = x_pref[x_pref_split:]
            y_pref_split = round(len(y_pref)/2)
            y_pref1 = y_pref[:y_pref_split]
            y_pref2 = y_pref[y_pref_split:]
            
            dist1 = levenshtein2.distance(x_pref1, y_pref1) # Levenshtein II
            dist2 = levenshtein2.distance(x_pref2, y_pref2)

            dist = dist1 + dist2

            distMatrix[i][j] = dist


# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()


print("Time needed:", end-start)

np.savetxt("../Results/PrefixSplit/PrefixSplit-" + file + "-Levenshtein2-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + ".csv", distMatrix, fmt='%i', delimiter = ";")
print(distMatrix)
times[2] = str(end-start)

######################
### Damerau-Levenshtein ###
######################

start = datetime.now()
distMatrix = np.full((n, n), 0, dtype = np.uint8)
print("Start:", start)

for i, x in enumerate(strings):
    for j, y in enumerate(strings):
        if j >= i: # only calculate upper right triangle of matrix
             # prefix removal
            prefix_len = len(os.path.commonprefix([x, y]))
            x_pref = x[prefix_len:]
            y_pref = y[prefix_len:]
            
            # string split
            x_pref_split = round(len(x_pref)/2)
            x_pref1 = x_pref[:x_pref_split]
            x_pref2 = x_pref[x_pref_split:]
            y_pref_split = round(len(y_pref)/2)
            y_pref1 = y_pref[:y_pref_split]
            y_pref2 = y_pref[y_pref_split:]
            
            dist1 = damerau.distance(x_pref1, y_pref1) # Damerau-Levenshtein
            dist2 = damerau.distance(x_pref2, y_pref2)
            
            dist = dist1 + dist2

            distMatrix[i][j] = dist


# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()


print("Time needed:", end-start)

np.savetxt("../Results/PrefixSplit/PrefixSplit-" + file + "-Damerau-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + ".csv", distMatrix, fmt='%i', delimiter = ";")
print(distMatrix)
times[3] = str(end-start)

######################
### Jaro-Winkler ###
######################

start = datetime.now()
distMatrix = np.full((n, n), 0, dtype = np.uint8)
print("Start:", start)

for i, x in enumerate(strings):
    for j, y in enumerate(strings):
        if j >= i: # only calculate upper right triangle of matrix
            # prefix removal
            prefix_len = len(os.path.commonprefix([x, y]))
            x_pref = x[prefix_len:]
            y_pref = y[prefix_len:]
            
            # string split
            x_pref_split = round(len(x_pref)/2)
            x_pref1 = x_pref[:x_pref_split]
            x_pref2 = x_pref[x_pref_split:]
            y_pref_split = round(len(y_pref)/2)
            y_pref1 = y_pref[:y_pref_split]
            y_pref2 = y_pref[y_pref_split:]
            
            dist1 = jarowinkler.similarity(x_pref1, y_pref1) * 255 / 2 # Jaro-Winkler v2 (faster than v1)
            dist2 = jarowinkler.similarity(x_pref2, y_pref2) * 255 /2

            dist = dist1 + dist2

            distMatrix[i][j] = dist

            distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()


print("Time needed:", end-start)

np.savetxt("../Results/PrefixSplit/PrefixSplit-" + file + "-JaroWinkler-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + ".csv", distMatrix, fmt='%i', delimiter = ";")
print(distMatrix)
times[4] = str(end-start)



###################
### End ###
###################

np.savetxt("../Results/PrefixSplit/PrefixSplit-" + file + "-times-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + ".csv", times, fmt="%s", delimiter = ";")


Start: 2021-01-30 21:28:53.065319
Time needed: 0:00:10.741896
[[ 0 53 46 ... 24 63 48]
 [53  0 36 ... 53 60 47]
 [46 36  0 ... 55 58 37]
 ...
 [24 53 55 ...  0 64 56]
 [63 60 58 ... 64  0 61]
 [48 47 37 ... 56 61  0]]
Start: 2021-01-30 21:29:04.116133
Time needed: 0:05:05.962109
[[ 0 46 36 ... 20 43 36]
 [46  0 30 ... 44 53 33]
 [36 30  0 ... 43 44 28]
 ...
 [20 44 43 ...  0 45 41]
 [43 53 44 ... 45  0 42]
 [36 33 28 ... 41 42  0]]
Start: 2021-01-30 21:34:10.393979
Time needed: 0:09:24.221129
[[ 0 64 40 ... 28 55 43]
 [64  0 44 ... 60 75 49]
 [40 44  0 ... 52 65 35]
 ...
 [28 60 52 ...  0 63 47]
 [55 75 65 ... 63  0 56]
 [43 49 35 ... 47 56  0]]
Start: 2021-01-30 21:43:34.918519
Time needed: 0:11:47.679154
[[ 0 46 36 ... 20 42 34]
 [46  0 29 ... 43 52 31]
 [36 29  0 ... 43 44 26]
 ...
 [20 43 43 ...  0 45 41]
 [42 52 44 ... 45  0 42]
 [34 31 26 ... 41 42  0]]
Start: 2021-01-30 21:55:22.893837
Time needed: 0:01:07.675914
[[254 171 157 ... 234 187 188]
 [171 254 228 ... 171 168 207]
 [15