## Segmentation of User Interaction Logs using Time Series Methods

A demo case used for the idea paper at CAISE 2025

In [None]:
import pandas as pd
import numpy as np
import stumpy
from stumpy import config

from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder

import util.util

import matplotlib.pyplot as plt
import pm4py
import matplotlib.pyplot as plt
import seaborn as sns

import time
from datetime import datetime

#### Data Gathering

Read the Logs that should be discovered from the folder "Leno".

Adjust the file names according to your needs:
1. SRRT_Plus => The file that contains the sequentially ordered traces
2. SRRT_Parallel => The file that contains the alternating traces

In [4]:
file_path = 'Leno/'
# The following two files were used in the Experiment in the paper
srrt_plus_filename = "experiment_extended_SR_RT_joint.csv"
srrt_parallel_filename = "experiment_extended_SR_RT_parallel.csv"

text_encoding_method = "utf-8"
seperator = ";"

srrt_plus_log = pd.read_csv(file_path + srrt_plus_filename, encoding=text_encoding_method, sep=seperator)
srrt_parallel_log = pd.read_csv(file_path + srrt_parallel_filename, encoding=text_encoding_method, sep=seperator)

## 1. Word2Vec Encoding Single Window Size Discovery

Single Sentence Word2Vec encoding

> Im Paper Änderungen
1. Methodenbeschreibung anpassen > Word2Vec
2. A reduced set of parameters provides better results as described in Matrix Profil 4 paper, 
    a Method to identify the best set of parameters for mining the data has to be developed.
2. Multi-Dimensionale Motif discovery
3. Experiment mit dieser Encoding method durchführen und ergebnisskurve auswerten

In [None]:
# ---- Inputs ----
log_to_discover = srrt_plus_log
# OR
log_to_discover = srrt_parallel_log

window_size = 30
motifs_to_discover = 10


columns_to_use = ['eventType', 'targetApp', 'target.tagName', 'target.name']
#columns_to_use = ["targetApp","eventType","url","target.workbookName","target.sheetName","target.id","target.class","target.tagName","target.type","target.name","target.innerText","target.checked","target.href","target.option","target.title","target.innerHTML"]

# ---- Single Log execution ----
groundTruth = util.util.generate_caseid_list(log_to_discover) 
ui_log_encoded_w2v = util.util.encode_word2vec(srrt_plus_log, orderedColumnsList=columns_to_use)
motif_distances, motif_indices, motif_subspaces, motif_mdls = util.util.mine_w2v(ui_log_encoded_w2v,window_size,motifs_to_discover)
print(motif_indices)

print(f"Ground Truth:{np.sort(groundTruth)}")

# ---- Measuring Method: Half the Window Size ----
# insert_spots, motif_spots, overlapDF = util.util.compare_sets(set(groundTruth), set(motif_indices[0]), (size/2))
# ---- Measuring Method: Intersection over Union
insert_spots, motif_spots, overlapDF = util.util.compare_sets_IoU(set(groundTruth), set(motif_indices[0]), window_size, iou_threshold=0.7)

mean_iou = 0

if 'IoU' in overlapDF.columns:
    mean_iou = overlapDF.loc[:, "IoU"].mean()

ground_truth_sum = len(groundTruth)
true_positives = len(insert_spots)
false_positives = motifs_to_discover - true_positives  # Incorrectly identified motifs
false_negatives = ground_truth_sum - true_positives  # Relevant motifs not identified

# Precision
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
print(f"Precision: {precision}")

# Recall
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
print(f"Recall: {recall}")

# F1-Score
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f"F1-Score: {f1_score}")

[[  80  240  321  402  643  724  805  965 1046 1209]]
Ground Truth:[   0   79  130  241  323  434  514  625  707  818  900 1011 1093 1204
 1284 1396 1538 1591 1671 1782 1864 1975 2057 2168 2248 2359 2441 2552
 2634 2745 2828 2939 3021 3135 3215 3326 3469 3522 3660 3715 3793 3904
 3986 4097 4180 4291 4373 4484 4566 4677 4757 4868 4948 5059 5141 5252
 5334 5445 5527 5638 5720 5833 5916 6027 6109 6220 6302 6413 6495 6606
 6688 6799 6882 6993 7075 7186 7268 7379 7461 7572 7654 7765 7847 7958
 8040 8152 8234 8345 8427 8538 8620 8731 8813 8925 9007 9118 9200 9311
 9393 9504]
Precision: 0.4
Recall: 0.04
F1-Score: 0.07272727272727272


  set_matches = set_matches._append(dict1, ignore_index=True)


In [None]:
# Create a figure with 1 row and 2 columns
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# --- Bar Chart ---
metrics = ['F1-Score', 'Recall', 'Precision', 'IoU']
values = [f1_score, recall, precision, mean_iou]
axes[0].bar(metrics, values, color='skyblue')
axes[0].set_ylim(0, 1)
axes[0].set_title('Performance Metrics', fontsize=14)
axes[0].set_ylabel('Score', fontsize=12)
for i, v in enumerate(values):
    axes[0].text(i, v + 0.02, f"{v:.2f}", ha='center', fontsize=10)
axes[0].grid(axis='y', linestyle='--', alpha=0.7)

# --- Violin Plot with Points for IoU ---
sns.violinplot(y=overlapDF["IoU"], ax=axes[1], inner=None, color='lightblue')
sns.stripplot(y=overlapDF["IoU"], ax=axes[1], color='black', size=3, jitter=0.2)
axes[1].set_title("IoU Distribution (Violin Plot)", fontsize=14)
axes[1].set_ylabel("IoU", fontsize=12)
axes[1].grid(axis='y', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'seaborn'

## 2. Automated Experiment for multiple window sizes

### Word2Vec Encoding Based

Iterates in the range of window size 25 to 66 over the time series encoded by with word2vec.

Can consider windowsize/2 or IoU threshold as quality value for measuring.


In [None]:
# ---- Inputs ----
log_to_discover = srrt_plus_log
log_to_discover_name = "SRRT_Plus"
# OR
log_to_discover = srrt_parallel_log
log_to_discover_name = "SRRT_Parallel"

ContextColumns = ["targetApp","eventType","url","target.workbookName","target.sheetName","target.id","target.class","target.tagName","target.type","target.name","target.innerText","target.checked","target.href","target.option","target.title","target.innerHTML"]
ContextColumns = ['eventType', 'targetApp', 'target.tagName', 'target.name']

NumberOfMotifsToDiscover = 10

# ---- Clean log ----
if "tuple:id" in log_to_discover.columns:
    log_to_discover = log_to_discover.drop(columns=["tuple:id"])

ui_log_encoded_w2v = util.util.encode_word2vec(srrt_plus_log, orderedColumnsList=ContextColumns)

groundTruth = util.uitl.generate_caseid_list(log_to_discover)
groundTruth_set = set(groundTruth)
ground_truth_sum = len(groundTruth)

# ---- Results container ----
results = []

# ---- Main loop ----
size = 25
discovery_repeat = False
while size <= 65:
    print(f"Processing size: {size}")
    motif_distances, motif_indices, motif_subspaces, motif_mdls = util.util.mine_w2v(ui_log_encoded_w2v,size,NumberOfMotifsToDiscover)

    discovered_set = set(motif_indices[0])
    if discovered_set or discovery_repeat:

        # insert_spots, motif_spots, overlapDF = util.util.compare_sets(groundTruth_set, discovered_set, (size/2))
        insert_spots, motif_spots, overlapDF = util.util.compare_sets_IoU(groundTruth_set, discovered_set, window_size=size, iou_threshold=0.5)

        if 'IoU' in overlapDF.columns:
            mean_iou = overlapDF.loc[:, "IoU"].mean()
        else:
            mean_iou = 0

        true_positives = len(insert_spots)
        false_positives = NumberOfMotifsToDiscover - true_positives
        false_negatives = ground_truth_sum - true_positives
        
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0


        # print(f"Loop {size} completed. Results appending. F1-Score: {f1_score}, Precision: {precision}, Recall: {recall}, IuO: {mean_iou}.")
        results.append({
            'window_size': size,
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'mean_iou': mean_iou,
            'discovered_indexes': motif_indices
        })
        size += 1
        discovery_repeat = False
    else:
        discovery_repeat = True


# ---- Save the results ----
results_df = pd.DataFrame(results)
timestamp = datetime.now().strftime("%y%m%d_%H%M")
results_df.to_csv(f'ExperimentResult/{timestamp}_w2v_iuo_{log_to_discover_name}.csv', index=False)
# print(results_df)

# ---- Plot F1-Score vs Window Size ----
plt.figure(figsize=(10, 6))
plt.plot(results_df['window_size'], results_df['f1_score'], marker='o', linewidth=2, label='F1-Score')
plt.plot(results_df['window_size'], results_df['recall'], marker='x', linewidth=2, label='Recall')
plt.plot(results_df['window_size'], results_df['precision'], marker='+', linewidth=2, label='Precision')
plt.plot(results_df['window_size'], results_df['mean_iou'], marker='s', linewidth=2, label='Mean IoU')

plt.title('Effect of Window Size on F1-Score', fontsize=16)
plt.xlabel('Window Size', fontsize=14)
plt.ylabel('F1-Score', fontsize=14)
plt.xticks(results_df['window_size'], rotation=45)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.legend(fontsize=12)
# ---- Save to file ----
plt.savefig(f"ExperimentResult/{timestamp}_plot_{log_to_discover_name}.png", dpi=300, bbox_inches='tight')
plt.show()

Processing size: 25


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 25 completed. Results appending. F1-Score: 0.05454545454545455, Precision: 0.3, Recall: 0.03, IuO: 0.9029304029304029.
Processing size: 26


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 26 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.9125615763546798.
Processing size: 27


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 27 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.7590909090909091.
Processing size: 28


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 28 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.7661708361505521.
Processing size: 29


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 29 completed. Results appending. F1-Score: 0.05454545454545455, Precision: 0.3, Recall: 0.03, IuO: 0.9125448028673836.
Processing size: 30


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 30 completed. Results appending. F1-Score: 0.05454545454545455, Precision: 0.3, Recall: 0.03, IuO: 0.8977272727272728.
Processing size: 31


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 31 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.868436465495289.
Processing size: 32


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 32 completed. Results appending. F1-Score: 0.09090909090909091, Precision: 0.5, Recall: 0.05, IuO: 0.8136675020885548.
Processing size: 33


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 33 completed. Results appending. F1-Score: 0.05454545454545455, Precision: 0.3, Recall: 0.03, IuO: 0.7396825396825397.
Processing size: 34


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 34 completed. Results appending. F1-Score: 0.03636363636363636, Precision: 0.2, Recall: 0.02, IuO: 0.9714285714285714.
Processing size: 35


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 35 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.6671393458120627.
Processing size: 36


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 36 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.6858215327688038.
Processing size: 37


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 37 completed. Results appending. F1-Score: 0.09090909090909091, Precision: 0.5, Recall: 0.05, IuO: 0.739348508634223.
Processing size: 38


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 38 completed. Results appending. F1-Score: 0.05454545454545455, Precision: 0.3, Recall: 0.03, IuO: 0.9658119658119658.
Processing size: 39


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 39 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.7122484115597458.
Processing size: 40


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 40 completed. Results appending. F1-Score: 0.05454545454545455, Precision: 0.3, Recall: 0.03, IuO: 0.5956753118360995.
Processing size: 41


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 41 completed. Results appending. F1-Score: 0.01818181818181818, Precision: 0.1, Recall: 0.01, IuO: 0.5769230769230769.
Processing size: 42


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 42 completed. Results appending. F1-Score: 0.03636363636363636, Precision: 0.2, Recall: 0.02, IuO: 0.5900000000000001.
Processing size: 43


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 43 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.7412518037518037.
Processing size: 44


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 44 completed. Results appending. F1-Score: 0.03636363636363636, Precision: 0.2, Recall: 0.02, IuO: 0.5576441102756893.
Processing size: 45


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 45 completed. Results appending. F1-Score: 0.03636363636363636, Precision: 0.2, Recall: 0.02, IuO: 0.5662832929782082.
Processing size: 46


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 46 completed. Results appending. F1-Score: 0.03636363636363636, Precision: 0.2, Recall: 0.02, IuO: 0.6433811802232855.
Processing size: 47


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 47 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.7482374292729796.
Processing size: 48


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 48 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.7457431457431457.
Processing size: 49


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 49 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.7671927914476208.
Processing size: 50


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 50 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.7655292160021614.
Processing size: 51


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 51 completed. Results appending. F1-Score: 0.09090909090909091, Precision: 0.5, Recall: 0.05, IuO: 0.7197223671850537.
Processing size: 52


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 52 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.766010412069306.
Processing size: 53


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 53 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.7817663817663817.
Processing size: 54


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 54 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.7707297726070863.
Processing size: 55


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 55 completed. Results appending. F1-Score: 0.07272727272727272, Precision: 0.4, Recall: 0.04, IuO: 0.642391526798183.
Processing size: 56


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 56 completed. Results appending. F1-Score: 0.09090909090909091, Precision: 0.5, Recall: 0.05, IuO: 0.739586524997941.
Processing size: 57


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 57 completed. Results appending. F1-Score: 0.09090909090909091, Precision: 0.5, Recall: 0.05, IuO: 0.607091046277666.
Processing size: 58


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 58 completed. Results appending. F1-Score: 0.09090909090909091, Precision: 0.5, Recall: 0.05, IuO: 0.7467642938968688.
Processing size: 59


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 59 completed. Results appending. F1-Score: 0.09090909090909091, Precision: 0.5, Recall: 0.05, IuO: 0.74688422970344.
Processing size: 60


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 60 completed. Results appending. F1-Score: 0.09090909090909091, Precision: 0.5, Recall: 0.05, IuO: 0.6522805832664987.
Processing size: 61


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 61 completed. Results appending. F1-Score: 0.09090909090909091, Precision: 0.5, Recall: 0.05, IuO: 0.7614319629803501.
Processing size: 62


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 62 completed. Results appending. F1-Score: 0.09090909090909091, Precision: 0.5, Recall: 0.05, IuO: 0.759993223151118.
Processing size: 63


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 63 completed. Results appending. F1-Score: 0.05454545454545455, Precision: 0.3, Recall: 0.03, IuO: 0.802262443438914.
Processing size: 64


  set_matches = set_matches._append(dict1, ignore_index=True)


Loop 64 completed. Results appending. F1-Score: 0.05454545454545455, Precision: 0.3, Recall: 0.03, IuO: 0.7011358506069795.
Processing size: 65
Loop 65 completed. Results appending. F1-Score: 0.05454545454545455, Precision: 0.3, Recall: 0.03, IuO: 0.7277160982727136.


  set_matches = set_matches._append(dict1, ignore_index=True)


# 3. Archive

Old encoding method. Improved after reviewer feedback and experiment results in the notebook above.

## 3.1. Hot Encoding Method(s)

Based on the context columns the discovery will be executed

In [None]:
log_to_discover = srrt_plus_log
#log_to_discover = srrt_parallel_log

# ContextColumns = ["targetApp","eventType","url","target.workbookName","target.sheetName","target.id"] # Leno Attributes
ContextColumns = ["targetApp","eventType","url","target.workbookName","target.sheetName","target.id","target.class","target.tagName","target.type","target.name","target.innerText","target.checked","target.href","target.option","target.title","target.innerHTML"]

size = 25
NumberOfMotifsToDiscover = 25

# ---- Generating Ground Truth ----
groundTruth = util.util.generate_caseid_list(log_to_discover)

# ---- Reading the File ----
if "tuple:id" in log_to_discover.columns:
    log_to_discover = log_to_discover.drop(columns=["tuple:id"])
uiLog_Encoding_method = 3 # 1=Hierarchy Encoding, 2=Co-Occurrance Encoding, 3=Hot Encoding

uiLog = util.util.encoding_UiLog(log_to_discover,orderedColumnsList=ContextColumns,encoding=uiLog_Encoding_method)

# ---- Time Series Mining ----
tm_matrix, event_series = util.util.discover_motifs(uiLog, window_size=size, normalize=True)

print(f"event_series shape: {event_series.shape}")
print(f"tm_matrix shape: {tm_matrix.shape}")
print(f"tm_matrix[:,0] shape: {tm_matrix[:,0].shape}")

# The motifs have to be exclusive, thus, no one activity must be part of a routine already discovered
config.STUMPY_EXCL_ZONE_DENOM = 1  # The exclusion zone is i ± m
top_motifs = stumpy.motifs(T=event_series, P=tm_matrix[:,0], min_neighbors=1, max_matches=NumberOfMotifsToDiscover, cutoff=5)
# Identify the outlier
discord_idx = np.argsort(tm_matrix[:, 0])[-1]
print(f"Discovered: {np.sort(top_motifs[1][0])}")

event_series shape: (9594,)
tm_matrix shape: (9570, 4)
tm_matrix[:,0] shape: (9570,)
Discovered: [   6   32  110  136  189  215  270  296  351  377  432  458  511  537
  593  673  699  754  780  835  861  914  940  995 1021]


## 3.2. Quality Measures

In [None]:
print(f"Ground Truth:{np.sort(groundTruth)}")

# For 1-dim time series
insert_spots, motif_spots, overlapDF = util.util.compare_sets(set(groundTruth), set(top_motifs[1][0]), (size/2))

ground_truth_sum = len(groundTruth)
true_positives = len(insert_spots)
false_positives = NumberOfMotifsToDiscover - true_positives  # Incorrectly identified motifs
false_negatives = ground_truth_sum - true_positives  # Relevant motifs not identified

# Precision
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
print(f"Precision: {precision}")

# Recall
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
print(f"Recall: {recall}")

# F1-Score
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f"F1-Score: {f1_score}")

## 3.3. Automated Experiment for multiple window sizes
### A. Hot Encoding Based Multi Window Experiment

The following code tests multiple window sizes againts the encoding method and number of motifs to be discovered and visualizes the F1-Score trend afterwards.

In [None]:
# ---- Inputs ----
log_to_discover = srrt_plus_log
# log_to_discover = srrt_parallel_log

ContextColumns = ["targetApp","eventType","url","target.workbookName","target.sheetName","target.id","target.class","target.tagName","target.type","target.name","target.innerText","target.checked","target.href","target.option","target.title","target.innerHTML"]
ContextColumns = ['eventType', 'targetApp', 'target.tagName', 'target.name']


NumberOfMotifsToDiscover = 10
uiLog_Encoding_method = 3  # 1=Hierarchy Encoding, 2=Co-Occurrance Encoding, 3=Hot Encoding

# ---- Clean log ----
if "tuple:id" in log_to_discover.columns:
    log_to_discover = log_to_discover.drop(columns=["tuple:id"])

uiLog = util.util.encoding_UiLog(log_to_discover, orderedColumnsList=ContextColumns, encoding=uiLog_Encoding_method)


groundTruth = util.util.generate_caseid_list(log_to_discover)
groundTruth_set = set(groundTruth)
ground_truth_sum = len(groundTruth)

# ---- Results container ----
results = []

# ---- Main loop ----
for size in range(25, 66):
    uiLog = util.util.encoding_UiLog(log_to_discover, orderedColumnsList=ContextColumns, encoding=uiLog_Encoding_method)
    tm_matrix, event_series = util.util.discover_motifs(uiLog, window_size=size, normalize=True)
    
    config.STUMPY_EXCL_ZONE_DENOM = 1
    top_motifs = stumpy.motifs(T=event_series, P=tm_matrix[:, 0], min_neighbors=1, max_matches=NumberOfMotifsToDiscover, cutoff=5)
    
    discovered_set = set(top_motifs[1][0])
    insert_spots, motif_spots, overlapDF = util.util.compare_sets(groundTruth_set, discovered_set, (size/2))
    # insert_spots, motif_spots, overlapDF = util.util.compare_sets_IoU(groundTruth_set, discovered_set, window_size=size, iou_threshold=0.5)

    true_positives = len(insert_spots)
    false_positives = NumberOfMotifsToDiscover - true_positives
    false_negatives = ground_truth_sum - true_positives
    
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    results.append({
        'window_size': size,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    })

    # print("Execution completed for window size:" + str(size))

# ---- Save the results ----
results_df = pd.DataFrame(results)
results_df.to_csv('motif_discovery_window_size_experiment.csv', index=False)
# print(results_df)

# ---- Plot F1-Score vs Window Size ----
plt.figure(figsize=(10, 6))
plt.plot(results_df['window_size'], results_df['f1_score'], marker='o', linewidth=2)

plt.title('Effect of Window Size on F1-Score', fontsize=16)
plt.xlabel('Window Size', fontsize=14)
plt.ylabel('F1-Score', fontsize=14)
plt.xticks(results_df['window_size'], rotation=45)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

## 4. Visualisation

Not Maintained

In [None]:
# ---- DFG Mining ----
caseuiLog = util.util.reduceLogToDiscovered(uiLog,top_motifs[1][0],size)
end_time = time.time()

cols = ["targetApp","eventType"]
caseuiLog["concept:name"] = caseuiLog[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
caseuiLog["time:timestamp"] = caseuiLog["timeStamp"]
caseuiLog['case:concept:name'] = caseuiLog['case:concept:name'].astype('int64')
pm4pyDf = pm4py.format_dataframe(caseuiLog)
uiLogDFG, start_activities, end_activities = pm4py.discover_dfg(pm4pyDf)
pm4py.view_dfg(uiLogDFG, start_activities, end_activities)

# ---- Motif Visualisation ----
starting_row = 0
ending_row = len(uiLog)-1
ids = uiLog.loc[starting_row:ending_row,'tuple:id'].tolist()
rows = [i for i in range(len(uiLog.loc[starting_row:ending_row,'tuple:id']))]

#Plot Event data
fig2, axs2 = plt.subplots(3, sharex=True, gridspec_kw={'hspace': 0})
plt.suptitle('Motif (Routine) Discovery', fontsize='10')

axs2[0].scatter(rows, ids, alpha=0.8)
axs2[0].set_ylabel('Events', fontsize='10')
# Plot Timeseries data
axs2[1].plot(event_series)
axs2[1].set_ylabel('Timeseries', fontsize='10')
# Plot Matrix profiles
axs2[2].set_xlabel('Activity', fontsize ='10')
axs2[2].set_ylabel('Matrix Profile', fontsize='10')
axs2[2].set_ylim(top=tm_matrix[:, 0].max()*1.1) #displaying the max value with some uplift for space in Graph
axs2[2].plot(tm_matrix[:, 0])
# Adding Dashed lines
for discovered in top_motifs[1][0]:
    axs2[0].axvline(x=discovered, linestyle="dashed",color='C1')
    #axs2[1].axvline(x=discovered, linestyle="dashed",color='C1')
    axs2[2].axvline(x=discovered, linestyle="dashed",color='C1')

# Display Pattern overlay
fig, ax = plt.subplots(figsize=(6.5, 2))
plt.title('Motif Overlay', fontsize='10')
ax.set_xlabel("Events", fontsize='10')
ax.set_ylabel("Timeseries", fontsize='10')
# Plot motif and nearest neighbor window
for i, val in enumerate(top_motifs[1][0]):
    colorPlot = 'C' + str(i)
    ax.plot(event_series[val:val+size], color=colorPlot, label=f"Motif {i}")
    
plt.legend(loc="best",fontsize=10)
plt.show()