In [1]:
## import all necessary libraries ##
import distance

from similarity.levenshtein import Levenshtein
levenshtein = Levenshtein()

from similarity.damerau import Damerau
damerau = Damerau()

from pyjarowinkler import distance as jwdistance
from similarity.jarowinkler import JaroWinkler
jarowinkler = JaroWinkler()

from similarity.weighted_levenshtein import WeightedLevenshtein
from similarity.weighted_levenshtein import CharacterSubstitutionInterface
import math
class CharacterSubstitution(CharacterSubstitutionInterface):
    def cost(self, c0, c1):
        return math.inf # assign inifte weight to all substitutions
levenshtein2 = WeightedLevenshtein(CharacterSubstitution())

#########################

import numpy as np
from datetime import datetime
import random 

#########################

from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.algo.filtering.log.variants import variants_filter

In [2]:
## import dataset ##
file = "BPIC19"
log = xes_importer.apply("../Datasets/"+ file + ".xes") # adjust for local file location

parsing log, completed traces ::   0%|          | 0/251734 [00:00<?, ?it/s]

In [3]:
## generate mapping from activities to chars ##

# extract all activities from log and convert them into list
activities = list(attributes_filter.get_attribute_values(log, "concept:name").keys())

transl = {} # dictionary mapping the names of the activities to chars
for i, a in enumerate(activities):
    transl[a] = chr(i+1)

def list_to_string(trace):
    string = ""
    for e in trace:
        string = string + transl[e] 
    return string

In [4]:
## generate list of strings for all variants ##

variants = list(variants_filter.get_variants(log).keys())
strings = [] # list of strings for each variant
variant_to_index = {} # dicitionary to translate variant to index in list for later lookup of traces

# generate strings
for i, v in enumerate(variants):
    strings.append(list_to_string(v.split(",")))
    variant_to_index[v] = i

# translation function: trace to index
def trace_to_index(trace):
    # convert trace to string representation of variant (concept:name separated by commas)
    trace_string = ""
    for e in trace:
        trace_string = trace_string + e["concept:name"] + ","
    trace_string = trace_string[:-1] # remove last comma
    
    # lookup string representation of variant
    return variant_to_index[trace_string]
        
print("First 5 strings:", strings[:5])
print("No. of strings:", len(strings))

First 5 strings: ['\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c', '\x01\x02\x03\x04\x06\x05\x07\x08\t\n\r\x0b\x0c\x05\x0e', '\x01\x02\x03\x04\x06\x05\x07\x08\n\r\t\x0f\n\r\x0b\x0c\x05\x0e', '\x01\x02\x03\x04\x05\x06\x08\x07\t\x10\n\x0b\x0c\x11\x0b\x0c', '\x01\x02\x03\x04\x05\x06\x08\x07']
No. of strings: 11973


In [84]:
## optional padding when using Hamming distance ##

count = 0
for s in strings:
    if len(s) > count:
        count = len(s)

strings_pad = []

for s in strings:
    strings_pad.append(s + chr(0) * (count - len(s)))

print("First 5 strings:", strings_pad[:5])

First 5 strings: ['\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\

# Generic calculation

In [None]:
## calculation of distance matrix ##

#strings = strings_pad # optional use of padded strings for Hamming
n = len(strings)
distMatrix = np.full((n, n), 0, dtype = np.byte)

start = datetime.now()
print("Start:", start)

for i, x in enumerate(strings):
    for j, y in enumerate(strings):
        if j >= i: # only calculate upper right triangle of matrix
            #dist = distance.hamming(x, y) # Hamming distance
            #dist = levenshtein.distance(x, y) # Levenshtein v2 (faster than v1)
            dist = levenshtein2.distance(x, y) # Levenshtein II
            #dist = damerau.distance(x, y) # Damerau-Levenshtein
            #dist = jarowinkler.similarity(x, y) # Jaro-Winkler v2 (faster than v1)

            distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()


print("Time needed:", end-start)

np.savetxt("Results/" + file + "-Levenshtein.csv", distMatrix, fmt='%i', delimiter = ";")
print(distMatrix)

Start: 2021-01-24 10:20:45.006312


In [94]:
np.savetxt("Results/" + file + "-Levenshtein.csv", distMatrix1, fmt='%i', delimiter = ";")

In [93]:
distMatrix1 = distMatrix + distMatrix.T

# Calculation for all measures

In [85]:
n = len(strings)
times = np.empty(5, dtype='object')

################################################################
## Hamming ##
################################################################
print("### Hamming ###")
distMatrix = np.full((n, n), 0, dtype = np.byte)

start = datetime.now()
print("Start:", start)

for i, x in enumerate(strings_pad):
    for j, y in enumerate(strings_pad):
        if j >= i: # only calculate upper right triangle of matrix
            dist = distance.hamming(x, y) # Hamming distance
        
            distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()

print("Time needed:", end-start)
print(distMatrix)
print("\n")
np.savetxt("Results/" + file + "-Hamming.csv", distMatrix, fmt='%i', delimiter = ";")
times[0] = str(end-start)

################################################################
## Levenshtein ##
################################################################
print("### Levenshtein ###")
distMatrix = np.full((n, n), 0, dtype = np.byte)

start = datetime.now()
print("Start:", start)

for i, x in enumerate(strings):
    for j, y in enumerate(strings):
        if j >= i: # only calculate upper right triangle of matrix
            #dist = distance.hamming(x, y) # Hamming distance
            dist = levenshtein.distance(x, y) # Levenshtein v2 (faster than v1)
            #dist = levenshtein2.distance(x, y) # Levenshtein II
            #dist = damerau.distance(x, y) # Damerau-Levenshtein
            #dist = jarowinkler.similarity(x, y) # Jaro-Winkler v2 (faster than v1)

            distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()

print("Time needed:", end-start)
print(distMatrix)
print("\n")
np.savetxt("Results/" + file + "-Levenshtein.csv", distMatrix, fmt='%i', delimiter = ";")
times[1] = str(end-start)

################################################################
## Levenshtein II ##
################################################################
print("### Levenshtein II ###")
distMatrix = np.full((n, n), 0, dtype = np.byte)

start = datetime.now()
print("Start:", start)

for i, x in enumerate(strings):
    for j, y in enumerate(strings):
        if j >= i: # only calculate upper right triangle of matrix
            dist = levenshtein2.distance(x, y) # Levenshtein II
        
            distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()

print("Time needed:", end-start)
print(distMatrix)
print("\n")
np.savetxt("Results/" + file + "-Levenshtein2.csv", distMatrix, fmt='%i', delimiter = ";")
times[2] = str(end-start)

################################################################
## Damerau-Levenshtein ##
################################################################
print("### Damerau ###")
distMatrix = np.full((n, n), 0, dtype = np.byte)

start = datetime.now()
print("Start:", start)

for i, x in enumerate(strings):
    for j, y in enumerate(strings):
        if j >= i: # only calculate upper right triangle of matrix
            dist = damerau.distance(x, y) # Damerau-Levenshtein

            distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()

print("Time needed:", end-start)
print(distMatrix)
print("\n")
np.savetxt("Results/" + file + "-Damerau.csv", distMatrix, fmt='%i', delimiter = ";")
times[3] = str(end-start)

################################################################
## Jaro-Winkler ##
################################################################
print("### Jaro-Winkler ###")
distMatrix = np.full((n, n), 0, dtype = np.byte)

start = datetime.now()
print("Start:", start)

for i, x in enumerate(strings):
    for j, y in enumerate(strings):
        if j >= i: # only calculate upper right triangle of matrix
            dist = jarowinkler.similarity(x, y) # Jaro-Winkler v2 (faster than v1)

            distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition
distMatrix = distMatrix + distMatrix.T

end = datetime.now()

print("Time needed:", end-start)
print(distMatrix)
print("\n")
np.savetxt("Results/" + file + "-JaroWinkler.csv", distMatrix, fmt='%i', delimiter = ";")
times[4] = str(end-start)


################################################################
## End ##
################################################################

np.savetxt("Results/" + file + "-times.csv", times, fmt="%s", delimiter = ";")

### Hamming ###
Start: 2021-01-20 21:01:04.118112
Time needed: 1:48:50.166446
[[ 0  7 12 ... 12 12 12]
 [ 7  0 10 ... 15 15 15]
 [12 10  0 ... 18 18 18]
 ...
 [12 15 18 ...  0  2  2]
 [12 15 18 ...  2  0  3]
 [12 15 18 ...  2  3  0]]




MemoryError: Unable to allocate 137. MiB for an array with shape (11973, 11973) and data type int8

In [88]:
np.savetxt("Results/" + file + "-Hamming1.csv", distMatrix, fmt="%i", delimiter = ";")