In [6]:
import pm4py
from collections import Counter

from sklearn.cluster import DBSCAN, KMeans

log_path = "logs/running-example.jsonocel"
ocel = pm4py.read_ocel_json(log_path)

In [7]:
object_types = list(set(ocel.objects["ocel:type"]))
object_type_to_id = {k:i for i, k in enumerate(object_types)}
object_type_to_id

{'items': 0, 'orders': 1, 'products': 2, 'customers': 3, 'packages': 4}

In [8]:
ocel.objects[ocel.objects["ocel:type"]=="customers"]

Unnamed: 0,ocel:oid,ocel:type,cost,producer,age,bankaccount
20,Marco Pegoraro,customers,,,50.0,91248.0
21,Gyunam Park,customers,,,55.0,27275.0
22,Majid Rafiei,customers,,,46.0,74370.0
23,Junxiong Gao,customers,,,52.0,96270.0
24,Seran Uysal,customers,,,40.0,69940.0
25,Christina Rensinghof,customers,,,41.0,17020.0
26,Wil van der Aalst,customers,,,34.0,48846.0
27,Christine Dobbert,customers,,,43.0,59702.0
28,Luis Santos,customers,,,48.0,37644.0
29,Kefang Ding,customers,,,38.0,76896.0


In [9]:
activity_to_numeric = {k:i for i, k in enumerate(list(set(ocel.events["ocel:activity"])))}
activity_to_numeric

{'reorder item': 0,
 'item out of stock': 1,
 'failed delivery': 2,
 'create package': 3,
 'confirm order': 4,
 'send package': 5,
 'payment reminder': 6,
 'place order': 7,
 'pay order': 8,
 'package delivered': 9,
 'pick item': 10}

In [10]:
import numpy as np
x = []
event_ids = list(ocel.events["ocel:eid"])
for eid in event_ids:
    eid_num = float(eid)
    if eid_num%1000==0:
        print(eid_num)
    counted = Counter(ocel.relations[ocel.relations["ocel:eid"]==eid]["ocel:type"])
    event_vec = np.zeros(len(object_type_to_id)+1)
    for k in counted.keys():
        vec_id = object_type_to_id[k]
        event_vec[vec_id] += counted[k]
    activity = list(ocel.events[ocel.events["ocel:eid"]==eid]["ocel:activity"])[0]
    event_vec[len(object_type_to_id)] = activity_to_numeric[activity]
    x.append(event_vec)
x = np.array(x)

1000.0
2000.0
3000.0
4000.0
5000.0
6000.0
7000.0
8000.0
9000.0
10000.0
11000.0
12000.0
13000.0
14000.0
15000.0
16000.0
17000.0
18000.0
19000.0
20000.0
21000.0
22000.0


In [20]:
from clustering_api import ClusteringOCEL

dbscan1 = DBSCAN(eps=0.2, min_samples=5)
dbscan2 = DBSCAN(eps=0.5, min_samples=5)
kmeans = KMeans(n_clusters=6, init="k-means++")
ocel_dbscan1 = ClusteringOCEL(model=dbscan1, ocel=ocel, dataset_from_ocel=x)
ocel_dbscan2 = ClusteringOCEL(model=dbscan2, ocel=ocel, dataset_from_ocel=x)
ocel_kmeans = ClusteringOCEL(model=kmeans, ocel=ocel, dataset_from_ocel=x)

In [21]:
ocel_dbscan1.fit(normalize=True)
print(ocel_dbscan1.unique_labels())

[0 1 2 3]


In [22]:
ocel_kmeans.fit(normalize=True)
print(ocel_kmeans.unique_labels())

[0 1 2 3 4 5]




In [40]:
from collections import defaultdict
clusters = defaultdict(list)
labels = ocel_dbscan1.model.labels_
for i in range(labels.size):
    clusters[labels[i]].append(ocel_dbscan1.ocel.events.iloc[i]["ocel:eid"])

In [41]:
for k, v in clusters.items():
    print(k, v[:10])

0 ['1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '9.0', '11.0', '12.0']
1 ['8.0', '10.0', '23.0', '37.0', '49.0', '56.0', '63.0', '64.0', '69.0', '76.0']
2 ['26.0', '52.0', '89.0', '94.0', '98.0', '102.0', '114.0', '130.0', '131.0', '133.0']
3 ['1488.0', '1557.0', '1612.0', '1656.0', '1687.0', '2372.0', '2404.0', '2456.0', '2551.0', '10352.0']


In [54]:
ocel.relations[ocel.relations["ocel:eid"]=="1488.0"]

Unnamed: 0,ocel:eid,ocel:activity,ocel:timestamp,ocel:oid,ocel:type,ocel:qualifier
11595,1488.0,failed delivery,2019-06-17 14:18:32,990119,orders,
11596,1488.0,failed delivery,2019-06-17 14:18:32,iPad mini,products,
11597,1488.0,failed delivery,2019-06-17 14:18:32,Marco Pegoraro,customers,
11598,1488.0,failed delivery,2019-06-17 14:18:32,880486,items,
11599,1488.0,failed delivery,2019-06-17 14:18:32,660078,packages,
