In [164]:
import pm4py
from collections import Counter

from sklearn.cluster import DBSCAN, KMeans

# log_path = "logs/running-example.jsonocel"
# clusters_path = "logs/running-example-clusters.json"

# log_path = "logs/recruiting.jsonocel"
# clusters_path = "logs/recruiting-clusters.json"

log_path = "logs/transfer_order.jsonocel"
clusters_path = "logs/transfer_order-clusters.json"

ocel = pm4py.read_ocel_json(log_path)

In [165]:
object_types = list(set(ocel.objects["ocel:type"]))
object_type_to_id = {k:i for i, k in enumerate(object_types)}
object_type_to_id

{'MATNR': 0, 'NLQNR': 1, 'QNAME': 2, 'WERKS': 3, 'NLPLA': 4}

In [166]:
ocel.objects[ocel.objects["ocel:type"]=="customers"]

Unnamed: 0,ocel:oid,ocel:type


In [167]:
import numpy as np
x = []
event_ids = list(ocel.events["ocel:eid"])
for eid in event_ids:
    eid_num = float(eid)
    if eid_num%1000==0:
        print(eid_num)
    counted = Counter(ocel.relations[ocel.relations["ocel:eid"]==eid]["ocel:type"])
    event_vec = np.zeros(len(object_type_to_id))
    for k in counted.keys():
        vec_id = object_type_to_id[k]
        if counted[k] > 0:
            event_vec[vec_id] = 1
    x.append(event_vec)
x = np.array(x)

0.0
1000.0
2000.0
3000.0
4000.0
5000.0
6000.0
7000.0
8000.0
9000.0
10000.0


In [168]:
from clustering_api import ClusteringOCEL

dbscan1 = DBSCAN(eps=0.1, min_samples=5)
dbscan2 = DBSCAN(eps=0.5, min_samples=5)
kmeans = KMeans(n_clusters=6, init="k-means++")
ocel_dbscan1 = ClusteringOCEL(model=dbscan1, ocel=ocel, dataset_from_ocel=x)
ocel_dbscan2 = ClusteringOCEL(model=dbscan2, ocel=ocel, dataset_from_ocel=x)
ocel_kmeans = ClusteringOCEL(model=kmeans, ocel=ocel, dataset_from_ocel=x)

In [169]:
ocel_dbscan1.fit(normalize=True)
print(ocel_dbscan1.unique_labels())

[0 1]


In [170]:
from collections import defaultdict
clusters = defaultdict(list)
labels = ocel_dbscan1.model.labels_
for i in range(labels.size):
    clusters[labels[i]].append(ocel_dbscan1.ocel.events.iloc[i]["ocel:eid"])

In [171]:
for k, v in clusters.items():
    print(k, v[:10])

0 ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
1 ['602', '603', '614', '615', '1234', '7742', '7868', '7994', '8120', '8246']


In [172]:
ocel.relations[ocel.relations["ocel:eid"]=="8.0"]

Unnamed: 0,ocel:eid,ocel:activity,ocel:timestamp,ocel:oid,ocel:type,ocel:qualifier


In [173]:
for k, v in clusters.items():
    print(f"cluster {k}, number of events {len(v)}")

cluster 0, number of events 10298
cluster 1, number of events 21


In [174]:
separate_plots = []
plot_data = {}
object_types = list(set(ocel.objects["ocel:type"]))
object_types_to_id = {object_type: i  for i, object_type in enumerate(object_types)}

plot_no = 0
threes = 0

for k, v in clusters.items():
    if threes == 3:
         plot_no += 1
         threes = 0
    if len(separate_plots)-1 < plot_no:
         separate_plots.append({})
    separate_plots[plot_no][k] = np.zeros(len(object_types))
    for eid in v:
        counted = Counter(ocel.relations[ocel.relations["ocel:eid"]==eid]["ocel:type"])
        for c_key in counted.keys():
            if counted[c_key] > 0:
                    separate_plots[plot_no][k][object_types_to_id[c_key]]=1
    threes += 1

In [175]:
import plotly.graph_objects as go
from plotly.graph_objects import Line

categories = [*object_types, object_types[0]]
for new_plot in separate_plots:
  data = []
  cluster_count = 4
  for k, v in new_plot.items():
      print(f'Cluster {k} -> {v}\n{object_types}')
      used_objects = []
      for i in range(len(v)):
         if v[i] == 1:
          used_objects.append(object_types[i])
      values = [*v, v[0]]
      data.append(
         go.Scatterpolar(
          r=values,
          theta=categories,
          fill='toself',
          name=f'Cluster {k} -> {used_objects}',
          line=Line(width=cluster_count)
      ))
      cluster_count -= 1
  fig = go.Figure(
     data,
     layout=go.Layout(
        title=go.layout.Title(text='Clusters comparison'),
        polar={'radialaxis': {'visible': True}},
        showlegend=True
    )
  )
  fig.show()


Cluster 0 -> [1. 1. 1. 1. 1.]
['MATNR', 'NLQNR', 'QNAME', 'WERKS', 'NLPLA']
Cluster 1 -> [1. 1. 0. 1. 1.]
['MATNR', 'NLQNR', 'QNAME', 'WERKS', 'NLPLA']



plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [176]:
import json

dump_clusters = {}
for k, v in clusters.items():
    dump_clusters[int(k)] = v
with open(clusters_path, "w") as outfile:
    json.dump(dump_clusters, outfile)