In [1]:
import pm4py
import os
os.chdir('../')
!pwd

/home/vinh/project/experiments-drift-detection


In [41]:
from src.utils.helpers import _getActivityNames, _getActivityNames_LogList, makeProgressBar
import math 
import numpy as np
import scipy.stats as stats
from typing import List, Tuple 
from scipy.signal import find_peaks
from enum import Enum
from pm4py.objects.log.obj import EventLog
import pm4py.util.xes_constants as xes 
from tqdm import tqdm
from collections import defaultdict

In [155]:
log = pm4py.read_xes('data/sudden_trace_noise0_1000_IOR.xes')

 ... (more hidden) ...


In [156]:
def extract_caseid_activity(event_log: EventLog) -> List[Tuple[str, str]]:

    caseid_activity_pairs = []
    
    for trace in event_log:
        case_id = trace.attributes.get('concept:name', str(id(trace)))
        
        for event in trace:
            activity = event.get('concept:name', 'Unknown Activity')
            caseid_activity_pairs.append((case_id, activity))
    
    return caseid_activity_pairs

In [157]:
res = extract_caseid_activity(log)

In [158]:
res

[('0', 'A'),
 ('0', 'B'),
 ('0', 'C'),
 ('0', 'A'),
 ('0', 'D'),
 ('0', 'F'),
 ('0', 'E'),
 ('0', 'G'),
 ('0', 'I'),
 ('0', 'J'),
 ('0', 'K'),
 ('0', 'M'),
 ('0', 'O'),
 ('1', 'A'),
 ('1', 'D'),
 ('1', 'F'),
 ('1', 'E'),
 ('1', 'G'),
 ('1', 'I'),
 ('1', 'J'),
 ('1', 'L'),
 ('1', 'M'),
 ('1', 'N'),
 ('10', 'A'),
 ('10', 'B'),
 ('10', 'C'),
 ('10', 'A'),
 ('10', 'D'),
 ('10', 'F'),
 ('10', 'E'),
 ('10', 'G'),
 ('10', 'H'),
 ('100', 'A'),
 ('100', 'D'),
 ('100', 'F'),
 ('100', 'E'),
 ('100', 'G'),
 ('100', 'I'),
 ('100', 'J'),
 ('100', 'L'),
 ('100', 'M'),
 ('100', 'O'),
 ('101', 'A'),
 ('101', 'B'),
 ('101', 'C'),
 ('101', 'A'),
 ('101', 'B'),
 ('101', 'C'),
 ('101', 'A'),
 ('101', 'B'),
 ('101', 'C'),
 ('101', 'A'),
 ('101', 'B'),
 ('101', 'C'),
 ('101', 'A'),
 ('101', 'D'),
 ('101', 'F'),
 ('101', 'E'),
 ('101', 'G'),
 ('101', 'I'),
 ('101', 'J'),
 ('101', 'K'),
 ('101', 'M'),
 ('101', 'O'),
 ('102', 'A'),
 ('102', 'B'),
 ('102', 'C'),
 ('102', 'A'),
 ('102', 'B'),
 ('102', 'C'),
 ('10

In [159]:
def _group_events_by_case(events: List[Tuple[str, str]]) -> Dict[str, List[str]]:
    """
    Group incoming event stream into cases (traces).
    Args:
        events: List of (case_id, activity)
    Returns:
        dict: {case_id: [activity_1, ..., activity_n]}
    """
    grouped = defaultdict(list)
    for cid, act in events:
        grouped[cid].append(act)
    return grouped

In [160]:
res1 = _group_events_by_case(res)

In [161]:
import uuid
import numpy as np
from typing import List, Dict, Tuple
from collections import defaultdict
from tqdm import tqdm
from river import drift

# ============================================================
#  PrefixCDD - Effective Online Concept Drift Detection
# ============================================================

# ------------------------------------------------------------
# Prefix Tree Structures
# ------------------------------------------------------------

class PrefixNode:
    """A node in a prefix tree (stores activity label, frequency, and children)."""
    def __init__(self, activity: str, parent=None):
        self.id = uuid.uuid4()
        self.activity = activity
        self.parent = parent
        self.children: Dict[str, 'PrefixNode'] = {}
        self.freq = 0

    def add_child(self, activity: str):
        """Add or return an existing child node for the given activity."""
        if activity not in self.children:
            self.children[activity] = PrefixNode(activity, parent=self)
        return self.children[activity]


class PrefixTree:
    """A prefix tree that represents a set of traces (process executions)."""
    def __init__(self):
        self.root = PrefixNode("root")

    def add_trace(self, trace: List[str]):
        """Insert a sequence of activities into the tree."""
        node = self.root
        for act in trace:
            node = node.add_child(act)
            node.freq += 1

    def get_node_freqs(self) -> Dict[str, int]:
        """Return mapping (activity -> frequency) of all nodes in the tree."""
        result = defaultdict(int)
        def traverse(node: PrefixNode):
            for child in node.children.values():
                result[child.activity] += child.freq
                traverse(child)
        traverse(self.root)
        return dict(result)

# ------------------------------------------------------------
# Tree Distance Metric (δ)
# ------------------------------------------------------------

def compute_tree_distance(t1: PrefixTree, t2: PrefixTree) -> float:
    """
    Compute δ distance metric between two prefix trees.
    Combines structural and frequency differences.
    """
    f1 = t1.get_node_freqs()
    f2 = t2.get_node_freqs()
    all_acts = set(f1.keys()).union(f2.keys())
    diff = 0.0
    for act in all_acts:
        diff += (f1.get(act, 0) - f2.get(act, 0)) ** 2
    return np.sqrt(diff)

In [162]:
def build_tree_from_events(events: List[Tuple[str, str]]) -> PrefixTree:
    """Build a prefix tree from a batch of (case, activity) events."""
    tree = PrefixTree()
    cases = _group_events_by_case(events)
    for trace in cases.values():
        tree.add_trace(trace)
    return tree

In [163]:
class PrefixCDD:
    """
    PrefixCDD: Online concept drift detection over event streams using prefix trees.
    Implements the approach from Huete et al. (2023).
    """

    def __init__(self, num_trees: int = 12, events_per_tree: int = 500, show_progress: bool = True):
        """
        Args:
            num_trees: Number of prefix trees per window (W = k × j)
            events_per_tree: Number of events per tree
            show_progress: Display progress bar while running
        """
        self.num_trees = num_trees
        self.events_per_tree = events_per_tree
        self.trees: List[PrefixTree] = []
        self.current_events: List[Tuple[str, str]] = []
        self.adwin = drift.ADWIN(delta = 0.2)
        self.detected_drifts: List[int] = []
        self.show_progress = show_progress
        self.total_events = 0
        self.drift_info = []

    # --------------------------------------------------------
    # Main Interface
    # --------------------------------------------------------
    def add_event(self, case_id: str, activity: str):
        """Add one event to PrefixCDD and check for drift."""
        self.current_events.append((case_id, activity))
        self.total_events += 1  # <-- track total events
        
        if len(self.current_events) >= self.events_per_tree:
            new_tree = build_tree_from_events(self.current_events)
            self.trees.append(new_tree)
            self.current_events = []
    
            if len(self.trees) > self.num_trees:
                self.trees.pop(0)
    
            if len(self.trees) >= 2:
                distance = compute_tree_distance(self.trees[-2], self.trees[-1])
                # pass case_id, activity to the detector
                self._update_drift_detector(distance, case_id, activity)


    def _update_drift_detector(self, distance: float, case_id: str, activity: str):
        """Feed ADWIN with distance and detect drift."""
        self.adwin.update(distance)
        if self.adwin.drift_detected:
            drift_index = len(self.trees)
            self.detected_drifts.append(drift_index)
            drift_event_index = self.total_events
            self.drift_info.append({
                "tree": drift_index,
                "event_index": drift_event_index,
                "case_id": case_id,
                "activity": activity,
                "distance": distance
            })
            print(f"[Drift] Tree {drift_index}, Event {drift_event_index}, "
                  f"Case {case_id}, Activity '{activity}', δ={distance:.4f}")



    # --------------------------------------------------------
    # Offline Evaluation
    # --------------------------------------------------------
    def process_stream(self, event_stream: List[Tuple[str, str]]):
        """
        Process a full event stream (list of (case_id, activity)).
        This is a convenience function for batch testing.
        """
        if self.show_progress:
            iterator = tqdm(event_stream, desc="Processing event stream")
        else:
            iterator = event_stream

        for cid, act in iterator:
            self.add_event(cid, act)

        return self.detected_drifts


In [164]:
model = PrefixCDD(num_trees=20, events_per_tree=50)

In [165]:
event_stream = [(cid, act) for cid, acts in res1.items() for act in acts]
detected = model.process_stream(event_stream)

 ... (more hidden) ...

[Drift] Tree 20, Event 8050, Case 757, Activity 'C', δ=2.8284





In [166]:
res1

defaultdict(list,
            {'0': ['A',
              'B',
              'C',
              'A',
              'D',
              'F',
              'E',
              'G',
              'I',
              'J',
              'K',
              'M',
              'O'],
             '1': ['A', 'D', 'F', 'E', 'G', 'I', 'J', 'L', 'M', 'N'],
             '10': ['A', 'B', 'C', 'A', 'D', 'F', 'E', 'G', 'H'],
             '100': ['A', 'D', 'F', 'E', 'G', 'I', 'J', 'L', 'M', 'O'],
             '101': ['A',
              'B',
              'C',
              'A',
              'B',
              'C',
              'A',
              'B',
              'C',
              'A',
              'B',
              'C',
              'A',
              'D',
              'F',
              'E',
              'G',
              'I',
              'J',
              'K',
              'M',
              'O'],
             '102': ['A',
              'B',
              'C',
              'A',
            

In [38]:
for trace in log:
    for _ in trace:
        time_list.append(_['time:timestamp'])
        act_list.append(_['concept:name'])


In [39]:
len(time_list)

58838

In [40]:
min(time_list)

datetime.datetime(1970, 1, 1, 1, 6, tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)))

In [41]:
max(time_list)

datetime.datetime(1981, 1, 13, 4, 35, tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)))

In [42]:
act = list(set(act_list))

In [70]:
def _getCausualFootprint(log:EventLog, activities:List[str]=None, activityName_key:str=xes.DEFAULT_NAME_KEY)-> np.chararray:
    if activities is None:
        activities = _getActivityNames(log, activityName_key=activityName_key)
    d = {(act1, act2): '%' for act1 in activities for act2 in activities}
    
    for trace in log:
        seen = set()
        A_touched = set()
        for event in trace:
            name = event[activityName_key]
            for s in seen:
                valnow = d[(s, name)]
                if valnow in ['%', 'A']:
                    d[(s,name)] = 'A'
                    A_touched.add((s,name))
                elif valnow == 'N':
                    d[(s, name)] = 'S'
            seen.add(name)
        for act1 in seen: 
            for act2 in activities:
                if d[(act1, act2)] == 'A' and (act1, act2) not in A_touched:
                    d[(act1, act2)] = 'S'
                elif d[(act1, act2)] == '%':
                    d[(act1, act2)] = 'N' 
    output = np.chararray((len(activities), len(activities)), unicode = True)
    for act1 in activities:
        i1 = activities.index(act1)
        for act2 in activities:
            i2 = activities.index(act2)
            output[i1][i2] = d[(act1, act2)] if d[(act1, act2)] != '%' else 'N'
    return output

  def _getCausualFootprint(log:EventLog, activities:List[str]=None, activityName_key:str=xes.DEFAULT_NAME_KEY)-> np.chararray:


In [75]:
drift_1 = log[1197:1202]

In [78]:
for i, trace in enumerate(drift_1):
    print(f"trace_{i}")
    for _ in trace:
        print(_['concept:name'])

trace_0
Register
Low Medical History
Create Questionnaire
Low Insurance Check
Prepare Notification Content
Send Notification by Phone
Send Notification by e-mail
Send Questionnaire
Receive Questionnaire Response
Archive
trace_1
Register
Low Medical History
Low Insurance Check
Create Questionnaire
Prepare Notification Content
Send Questionnaire
Receive Questionnaire Response
Archive
trace_2
Register
Low Medical History
Low Insurance Check
Create Questionnaire
Prepare Notification Content
Send Questionnaire
Skip Questionnaire
Archive
trace_3
Register
Low Medical History
Create Questionnaire
Low Insurance Check
Send Questionnaire
Receive Questionnaire Response
Prepare Notification Content
Send Notification by Phone
Send Notification by Post
Archive
trace_4
Register
Create Questionnaire
High Insurance Check
Prepare Notification Content
Send Questionnaire
Receive Questionnaire Response
Send Notification by Post
Archive
