In [1]:
## import all necessary libraries ##
import distance

from similarity.levenshtein import Levenshtein
levenshtein = Levenshtein()

from similarity.damerau import Damerau
damerau = Damerau()

from pyjarowinkler import distance as jwdistance
from similarity.jarowinkler import JaroWinkler
jarowinkler = JaroWinkler()

from similarity.weighted_levenshtein import WeightedLevenshtein
from similarity.weighted_levenshtein import CharacterSubstitutionInterface
import math
class CharacterSubstitution(CharacterSubstitutionInterface):
    """Substitution-cost policy that prices every substitution at infinity.

    Passed to ``WeightedLevenshtein`` below; presumably this leaves only
    insertions and deletions as usable edit operations (verify against the
    library's implementation).
    """

    # infinite weight shared by all substitutions
    _SUBSTITUTION_COST = math.inf

    def cost(self, c0, c1):
        """Return the cost of substituting ``c0`` by ``c1``.

        Both arguments are ignored: the result is always ``math.inf``.
        """
        return self._SUBSTITUTION_COST
levenshtein2 = WeightedLevenshtein(CharacterSubstitution())

#########################

import numpy as np
from datetime import datetime
import random 

#########################

from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.algo.filtering.log.variants import variants_filter

In [85]:
## import dataset ##
# Name of the event log (without extension); reused later for result file names.
file = "BPIC20"
# adjust for local file location
log_path = "../Datasets/" + file + ".xes"
log = xes_importer.apply(log_path)

parsing log, completed traces ::   0%|          | 0/7065 [00:00<?, ?it/s]

In [86]:
## generate mapping from activities to chars ##

# extract all activities from log and convert them into list
# all distinct "concept:name" values found in the log, as a list
activity_counts = attributes_filter.get_attribute_values(log, "concept:name")
activities = list(activity_counts.keys())

# dictionary mapping the names of the activities to chars;
# numbering starts at chr(1), so chr(0) is never assigned to an activity
transl = {name: chr(pos + 1) for pos, name in enumerate(activities)}

def list_to_string(trace):
    """Encode a sequence of activity names as a string.

    Each element of ``trace`` is looked up in the module-level ``transl``
    dictionary and the resulting characters are concatenated in order.

    Raises:
        KeyError: if an element of ``trace`` is not a key of ``transl``.
    """
    return "".join(transl[name] for name in trace)

In [87]:
## generate list of strings for all variants ##

variants = list(variants_filter.get_variants(log).keys())

# list of strings for each variant: the variant key is split on commas and
# every activity name is replaced by its character from `transl`
strings = [list_to_string(variant.split(",")) for variant in variants]

# dictionary to translate variant to index in list for later lookup of traces
variant_to_index = {variant: pos for pos, variant in enumerate(variants)}

# translation function: trace to index
def trace_to_index(trace):
    """Return the index of the variant that ``trace`` belongs to.

    The variant key is rebuilt by joining the ``"concept:name"`` value of
    every event with commas and is then looked up in the module-level
    ``variant_to_index`` dictionary.

    Raises:
        KeyError: if the rebuilt key is not present in ``variant_to_index``.
    """
    variant_key = ",".join(event["concept:name"] for event in trace)
    return variant_to_index[variant_key]
        
# Quick look at the encoding: the first five encoded variants and the
# total number of variant strings.
print("First 5 strings:", strings[:5])
print("No. of strings:", len(strings))

First 5 strings: ['\x01\x02\x03\x04\x05\x06\x07\x08', '\x01\x02\t', '\x01\x03\n\x0b\x05\x0c\x05\x0c\x05\x0c\x06\x06\x06\x07\x07\x07\x08\x08\x08\x05\x0c\x06\x07\x08\x05\x0c\x05\x0c\x06\x06\x07\x07\x08\x08\x05\x0c\x05\x0c\x06\x06\x07\x07\x08\x08\x05\x05\x0c\x06\x0c\x05\x0c\x05\x0c\x05\x0c\x06\x06\x06\x06\x05\x0c\x05\x0c\x06\x06\x07\x07\x07\x07\x07\x07\x08\x08\x08\x08\x08\x08\x07\x08\x05\x0c\x06\x07\x08\x02\x05\r\x06\x07\x08', '\x01\x02\x03\x0e\x04\x05\x0f\x10\x05\x0c\x06\x07\x08', '\x03\x04\x01\x02\x11']
No. of strings: 1478


In [42]:
## optional padding when using Hamming distance ##

# length of the longest string (0 if there are no strings)
count = max((len(s) for s in strings), default=0)

# right-pad every string with chr(0) up to the common length `count`
strings_pad = [s.ljust(count, chr(0)) for s in strings]

# from here on `strings` refers to the padded versions
strings = strings_pad

print("First 5 strings:", strings_pad[:5])

First 5 strings: ['\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', '\x01\x02\t\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', '\x01\x03\n\x0b\x05\x0c\x05\x0c\x05\x0c\x06\x06\x06\x07\x07\x07\x08\x08\x08\x05\x0c\x06\x07\x08\x05\x0c\x05\x0c\x06\x06\x07\x07\x08\x08\x05\x0c\x05\x0c\x06\x06\x07\x07\x08\x08\x05\x05\x0c\x06\x0c\x05\x0c\x05\x0c\x05\x0c\x06\x06\x06\x06\x05\x0c\x05\x0c\x06\

In [61]:
# Manual check of the half-split on the first two strings.
x = strings[0]
y = strings[1]

# NOTE: round() rounds halves to the nearest even number, so odd lengths are
# not split uniformly (length 3 -> 2/1, length 5 -> 2/3).
x_split = round(len(x)/2)
x1 = x[:x_split]
x2 = x[x_split:]
y_split = round(len(y)/2)
# y1 is the first half and y2 the second half, matching x1/x2 here and the
# split used in the distance-matrix loops (the two slices were swapped before).
y1 = y[:y_split]
y2 = y[y_split:]

print(len(y1))
print(len(y2))

45
45


# Calculation with string split

In [83]:
## calculation of distance matrix ##

#strings = strings_pad # optional use of padded strings for Hamming
n = len(strings)
# int16 instead of np.byte (int8): a summed distance above 127 does not fit
# into int8 and would wrap around or raise, depending on the NumPy version.
distMatrix = np.zeros((n, n), dtype = np.int16)
start = datetime.now()
print("Start:", start)

# Split every string into two halves once, instead of re-slicing inside the
# n*n loop.
# NOTE: round() rounds halves to the nearest even number, so odd lengths are
# not split uniformly (length 3 -> 2/1, length 5 -> 2/3). Kept unchanged so
# results stay comparable with earlier runs.
halves = []
for s in strings:
    s_split = round(len(s)/2)
    halves.append((s[:s_split], s[s_split:]))

for i in range(n):
    x1, x2 = halves[i]
    for j in range(i, n): # only calculate upper right triangle of matrix
        y1, y2 = halves[j]

        dist1 = distance.hamming(x1, y1) # Hamming distance
        dist2 = distance.hamming(x2, y2)

        #dist1 = levenshtein.distance(x1, y1) # Levenshtein v2 (faster than v1)
        #dist2 = levenshtein.distance(x2, y2)

        #dist1 = levenshtein2.distance(x1, y1) # Levenshtein II
        #dist2 = levenshtein2.distance(x2, y2)

        #dist1 = damerau.distance(x1, y1) # Damerau-Levenshtein
        #dist2 = damerau.distance(x2, y2)

        #dist1 = jarowinkler.similarity(x1, y1) * 100 / 2 # Jaro-Winkler v2 (faster than v1)
        #dist2 = jarowinkler.similarity(x2, y2) * 100 /2

        # total distance = distance of the first halves + distance of the second halves
        dist = dist1 + dist2
        distMatrix[i][j] = dist

# mirror upper right triangle of matrix by adding the transposition;
# the diagonal is subtracted once so it is not counted twice
distMatrix = distMatrix + distMatrix.T - np.diag(np.diag(distMatrix))

end = datetime.now()


print("Time needed:", end-start)

np.savetxt("Results/Split/Split-" + file + "-Hamming.csv", distMatrix, fmt='%i', delimiter = ";")
print(distMatrix)

Start: 2021-01-24 14:30:17.352365
Time needed: 0:00:20.198630
[[ 0  6 88 ... 13 11  8]
 [ 6  0 89 ... 13 12  6]
 [88 89  0 ... 90 90 90]
 ...
 [13 13 90 ...  0 11 11]
 [11 12 90 ... 11  0 10]
 [ 8  6 90 ... 11 10  0]]


'\x01\x03\n\x0b\x05\x0c\x05\x0c\x05\x0c\x06\x06\x06\x07\x07\x07\x08\x08\x08\x05\x0c\x06\x07\x08\x05\x0c\x05\x0c\x06\x06\x07\x07\x08\x08\x05\x0c\x05\x0c\x06\x06\x07\x07\x08\x08\x05\x05\x0c\x06\x0c\x05\x0c\x05\x0c\x05\x0c\x06\x06\x06\x06\x05\x0c\x05\x0c\x06\x06\x07\x07\x07\x07\x07\x07\x08\x08\x08\x08\x08\x08\x07\x08\x05\x0c\x06\x07\x08\x02\x05\r\x06\x07\x08'

# All distance measures (with string split)

In [None]:
## calculation of distance matrix ##

# Measures selectable via m: index -> (label used in the file name, callable).
# Every callable takes two strings and returns a number.
measures = {
    0: ("Hamming", distance.hamming), # Hamming distance (skipped by the range below)
    1: ("Levenshtein", levenshtein.distance), # Levenshtein v2 (faster than v1)
    2: ("Levenshtein2", levenshtein2.distance), # Levenshtein II
    3: ("Damerau", damerau.distance), # Damerau-Levenshtein
    # factor 100 scaling for integer storage. factor 0.5 for jaro winkler scaling
    # (this entry is a similarity, not a distance)
    4: ("JaroWinkler", lambda a, b: jarowinkler.similarity(a, b) * 100 / 2),
}

#strings = strings_pad # optional use of padded strings for Hamming
n = len(strings)

# Split every string into two halves once; the split does not depend on the measure.
# NOTE: round() rounds halves to the nearest even number, so odd lengths are
# not split uniformly (length 3 -> 2/1, length 5 -> 2/3). Kept unchanged so
# results stay comparable with earlier runs.
halves = []
for s in strings:
    s_split = round(len(s)/2)
    halves.append((s[:s_split], s[s_split:]))

for m in range(1, 5):
    # resolved before the loops so `method` is defined even if `strings` is empty
    method, measure = measures[m]

    # int16 instead of np.byte (int8): values above 127 (long traces, or a
    # doubled Jaro-Winkler diagonal of 200) do not fit into int8.
    distMatrix = np.zeros((n, n), dtype = np.int16)

    start = datetime.now()
    print("Start:", start)

    for i in range(n):
        x1, x2 = halves[i]
        for j in range(i, n): # only calculate upper right triangle of matrix
            y1, y2 = halves[j]

            dist1 = measure(x1, y1)
            dist2 = measure(x2, y2)

            # total = value for the first halves + value for the second halves;
            # float results are truncated when stored in the integer matrix
            dist = dist1 + dist2
            distMatrix[i][j] = dist

    # mirror upper right triangle of matrix by adding the transposition;
    # the diagonal is subtracted once so it is not counted twice (it is
    # non-zero for the Jaro-Winkler similarity)
    distMatrix = distMatrix + distMatrix.T - np.diag(np.diag(distMatrix))

    end = datetime.now()


    print("Time needed:", end-start)

    np.savetxt("Results/Split/Split-" + file + "-" + method + ".csv", distMatrix, fmt='%i', delimiter = ";")
    print(distMatrix)

Start: 2021-01-24 14:32:40.108029
Time needed: 0:02:23.402584
[[ 0  6 85 ... 11 11  8]
 [ 6  0 89 ... 13 12  6]
 [85 89  0 ... 85 86 88]
 ...
 [11 13 85 ...  0  8 10]
 [11 12 86 ...  8  0  9]
 [ 8  6 88 ... 10  9  0]]
Start: 2021-01-24 14:35:04.313969
Time needed: 0:04:38.914714
[[ 0  7 86 ... 13 14 12]
 [ 7  0 91 ... 16 15  7]
 [86 91  0 ... 93 94 92]
 ...
 [13 16 93 ...  0 13 13]
 [14 15 94 ... 13  0 12]
 [12  7 92 ... 13 12  0]]
Start: 2021-01-24 14:39:44.067759
