## **Trace test relabelling and Entropy Analysis**

#### **Import log**

In [None]:
import pm4py
from pm4py.objects.log.obj import EventLog, Trace, Event
from pm4py.algo.conformance.alignments.petri_net import algorithm as alignments
import datetime
import random
from pm4py.objects.log.importer.xes import importer as xes_importer
import numpy as np
from copy import deepcopy
import math
import itertools
from collections import defaultdict # mi permette di inizializzare un dizionario con valori di default (0 nel mio caso)
from pm4py.objects.petri_net import semantics

#### **Markov Process**

In [None]:
class Sample:
	"""One observation (x, y): x is the prefix seen so far, y the next transition."""

	def __init__(self, prefix, next_transition, trace_id, position):
		"""Store the sample together with the trace and position it was taken from.

		Args:
			prefix: sequence of events that precede the current position.
			next_transition: transition observed at the current position.
			trace_id: identifier of the trace the sample comes from.
			position: position of the sample inside the alignment.
		"""
		self.prefix, self.next_transition = prefix, next_transition
		self.trace_id, self.position = trace_id, position
	
class State:
	"""A state of the Markov process together with the samples observed in it."""

	def __init__(self, state_id, marking):
		"""Create an empty state.

		Args:
			state_id: identifier of the state.
			marking: marking associated with the state.
		"""
		self.state_id = state_id
		self.marking = marking
		self.samples = []  # Sample objects recorded in this state
		self.transitions = defaultdict(int)  # next transition -> number of occurrences
		self.total_samples = 0  # denominator of the transition probabilities

	def add_sample(self, sample):
		"""Record `sample` and count its `next_transition` once more."""
		self.samples.append(sample)
		self.transitions[sample.next_transition] += 1
		self.total_samples += 1

	def get_transition_probabilities(self):
		"""Return {transition: relative frequency}; empty dict when no sample was added."""
		total = self.total_samples
		if total == 0:
			return {}
		return {label: hits / total for label, hits in self.transitions.items()}

	def calculate_entropy(self):
		"""Return the Shannon entropy (base 2) of the outgoing-transition distribution."""
		result = 0.0
		# An empty distribution leaves the accumulator at 0.0.
		for p in self.get_transition_probabilities().values():
			if p > 0:
				result -= p * math.log2(p)
		return result


class MarkovProcess:
	"""Markov process over Petri-net markings, built by replaying an aligned log.

	Every distinct marking met while replaying the alignments becomes a state;
	each replayed model transition is stored as a sample of the state it was
	fired from. The process entropy is the sum of the per-state entropies
	weighted by the state probabilities.
	"""

	def __init__(self, perfect_aligned_log, net, initial_marking, final_marking):
		"""Build the states and compute the process entropy.

		Args:
			perfect_aligned_log: sized iterable of alignments, each a sized
				sequence of (log_label, model_label) pairs.
			net: Petri net exposing `transitions`, whose items have a `label`.
			initial_marking: marking every replay starts from (copied per trace).
			final_marking: stored as an attribute; not used by this class.
		"""
		self.perfect_aligned_log = perfect_aligned_log
		self.net = net
		self.initial_state = None
		self.initial_marking = initial_marking
		self.final_marking = final_marking
		# Denominator of the state probabilities: total number of alignment steps.
		self.total_positions = self.calculate_tot_positions()
		# state_id -> number of samples recorded in that state.
		self.state_visit_counts = defaultdict(int)

		self.states = self.get_states()  # state_id -> State object
		self.process_entropy = self.calculate_entropy()

	def get_states(self):
		"""Replay every alignment and return {state_id: State}.

		Steps whose model label is empty/None or ">>" add no sample and leave
		the marking unchanged. When firing a transition yields no marking (or
		raises), the rest of that alignment is skipped.
		"""
		states = {}
		marking_to_state = {}  # marking key -> state_id
		skipped_log_symbols = {">>", "ε"}

		# Resolve labels once instead of scanning net.transitions at every step;
		# setdefault keeps the first transition met for a label, as the scan did.
		label_to_transition = {}
		for transition in self.net.transitions:
			label_to_transition.setdefault(transition.label, transition)

		for trace_idx, alignment in enumerate(self.perfect_aligned_log):
			current_marking = self.initial_marking.copy()
			prefix_so_far = []  # log events seen before the current position
			for pos, (trace_event, net_transition) in enumerate(alignment):
				if current_marking is None:
					break

				# The same marking reached from another trace maps to the same state.
				marking_key = tuple(sorted((str(p), c) for p, c in current_marking.items()))
				state_id = marking_to_state.get(marking_key)
				if state_id is None:
					state_id = f"S{len(states)}"
					states[state_id] = State(state_id, current_marking.copy())
					marking_to_state[marking_key] = state_id

				if net_transition and net_transition != ">>":
					# Copy the prefix so later appends do not alter stored samples.
					sample = Sample(list(prefix_so_far), net_transition, trace_idx, pos)
					states[state_id].add_sample(sample)
					self.state_visit_counts[state_id] += 1

					# Advance the marking by firing the matching transition, if any.
					transition_obj = label_to_transition.get(net_transition)
					if transition_obj is not None:
						try:
							current_marking = semantics.execute(transition_obj, self.net, current_marking)
						except Exception:
							# Best effort: a failed firing abandons the rest of this trace.
							current_marking = None

				if trace_event and trace_event not in skipped_log_symbols:
					prefix_so_far.append(trace_event)

		return states

	def calculate_tot_positions(self):
		"""Return the total number of steps over all alignments."""
		return sum(len(alignment) for alignment in self.perfect_aligned_log)

	def calculate_state_probability(self):
		"""Return {state_id: visits / total_positions} for every visited state."""
		return {
			state_id: visit_count / self.total_positions
			for state_id, visit_count in self.state_visit_counts.items()
		}

	def calculate_entropy(self):
		"""Return the sum over states of state probability times state entropy."""
		state_probabilities = self.calculate_state_probability()
		process_entropy = 0.0
		for state in self.states.values():
			# A state created at a step that produced no sample has no visit
			# count; it contributes with probability 0 instead of a KeyError.
			state_probability = state_probabilities.get(state.state_id, 0.0)
			process_entropy += state_probability * state.calculate_entropy()
		return process_entropy

#### **Relabelling**

In [None]:
class EnrichedLog:
	"""Pair each log move of an alignment with its original event and a binary code.

	For every alignment step that carries a log label, the matching event of
	the original trace supplies the label and the timestamp; each distinct
	label receives one of the 2**k binary codes, in order of first appearance.
	The result is available as `enriched_log` (one list of dicts per trace).
	"""

	def __init__(self, aligned_log, original_log, k,
				 name_attr="concept:name", time_attr="time:timestamp",
				 include_model_moves=False, binary_as="tuple"):
		"""
		include_model_moves: when True, model-move steps are kept with label None
		binary_as: "tuple" (default) or "string" for the binary_label
		"""
		self.aligned_log, self.original_log = aligned_log, original_log
		self.k = k
		self.name_attr, self.time_attr = name_attr, time_attr
		self.include_model_moves = include_model_moves
		self.binary_as = binary_as

		# Codes are handed out in itertools.product order, one per new label.
		self.all_codes = list(itertools.product([0, 1], repeat=k))
		self.label_to_code = {}
		self.code_index = 0

		self.enriched_log = self._build_enriched_log()

	def _get_original_events(self, trace_obj):
		"""Return the events of a trace given as {'events': [...]} or as any iterable."""
		if isinstance(trace_obj, dict) and "events" in trace_obj:
			return trace_obj["events"]
		try:
			events = list(trace_obj)
		except Exception:
			raise ValueError("Formato della trace originale non riconosciuto")
		return events

	def _extract_move_label(self, move):
		"""Return the label carried by an alignment move (string, tuple/list or other)."""
		if move is None or isinstance(move, str):
			return move
		if isinstance(move, (tuple, list)):
			# First string found either directly or as head of a nested
			# tuple/list, e.g. ('Triage',) or (('Triage', 'Triage'), ...).
			for item in move:
				if isinstance(item, str):
					return item
				nested = isinstance(item, (tuple, list)) and len(item) > 0
				if nested and isinstance(item[0], str):
					return item[0]
		return str(move)

	def _get_event_attr(self, event, attr):
		"""Read `attr` from a dict-like or indexable event; None when unavailable."""
		try:
			if hasattr(event, "get"):
				return event.get(attr)
			return event[attr]
		except Exception:
			pass
		try:
			return event[attr]
		except Exception:
			return None

	def _to_timestamp_number(self, ts):
		"""Convert a number, a datetime or a float-convertible value to float (else None)."""
		if ts is None:
			return None
		if isinstance(ts, datetime.datetime):
			return float(ts.timestamp())
		if not isinstance(ts, (int, float)):
			try:
				return float(ts)
			except Exception:
				return None
		return float(ts)

	def _find_event_index(self, events, start, consumed, label):
		"""Index of the next unused event named `label`, else the next unused one, else None."""
		pending = [j for j in range(start, len(events)) if j not in consumed]
		for j in pending:
			if self._get_event_attr(events[j], self.name_attr) == label:
				return j
		return pending[0] if pending else None

	def _code_for(self, label):
		"""Return the binary code of `label`, assigning the next free code on first use."""
		if label is None:
			return None
		if label not in self.label_to_code:
			if self.code_index >= len(self.all_codes):
				raise ValueError(f"Non ci sono abbastanza codici binari (k={self.k}) per etichette uniche trovate; aumenta k")
			self.label_to_code[label] = self.all_codes[self.code_index]
			self.code_index += 1
		code = self.label_to_code[label]
		if self.binary_as == "string":
			return "".join(map(str, code))
		return code

	def _make_entry(self, label, code, stamp, model_move):
		"""Assemble one enriched event."""
		return {
			"original_label": label,
			"binary_label": code,
			"timestamp": stamp,
			"model_transition": self._extract_move_label(model_move)
		}

	# core builder
	def _build_enriched_log(self):
		"""Build the enriched traces, one per entry of `aligned_log`."""
		result = []
		for idx, info in enumerate(self.aligned_log):
			steps = info["alignment"]
			events = self._get_original_events(self.original_log[idx])

			row = []
			consumed = set()
			cursor = 0
			for log_move, model_move in steps:
				log_label = self._extract_move_label(log_move)
				if log_label is None or str(log_label).startswith(">>"):
					# Model move: no log event is consumed.
					if self.include_model_moves:
						row.append(self._make_entry(None, None, None, model_move))
					continue

				hit = self._find_event_index(events, cursor, consumed, log_label)
				if hit is None:
					# No event left in the original trace: keep the alignment label.
					label, stamp = log_label, None
				else:
					source_event = events[hit]
					label = self._get_event_attr(source_event, self.name_attr)
					stamp = self._to_timestamp_number(
						self._get_event_attr(source_event, self.time_attr))
					consumed.add(hit)
					cursor = hit + 1

				row.append(self._make_entry(label, self._code_for(label), stamp, model_move))

			result.append(row)

		return result


In [None]:
class TraceTest:
	"""Randomly generated trace test: a pattern over binary codes plus a time threshold.

	Attributes:
		psi: list of k values drawn uniformly from {-1, 0, 1}; RelabelledLog.satisfies
			treats -1 as "any value" for that position.
		threshold: random integer in [min_threshold, max_threshold].
	"""

	def __init__(self, k=3, min_threshold=1000, max_threshold=10000):
		"""Draw a random pattern of length k and a random threshold.

		Args:
			k: length of the pattern `psi`.
			min_threshold: smallest threshold that may be drawn.
			max_threshold: largest threshold that may be drawn.
		"""
		self.psi = [random.choice([-1, 0, 1]) for _ in range(k)]
		# The lower bound used to be hard-coded to 0, ignoring min_threshold.
		# It is clamped to max_threshold so calls that only lower the maximum
		# below the default minimum keep working.
		lower_bound = min(min_threshold, max_threshold)
		self.threshold = random.randint(lower_bound, max_threshold)

In [None]:
class RelabelledLog:
	"""Relabel every event of an enriched log with the outcome of a set of trace tests.

	Each event label is extended with one digit per trace test: 1 when the
	test holds at that position, 0 otherwise. The result is `relabelled_log`.
	"""

	def __init__(self, trace_test_set, log):
		"""
		Args:
			trace_test_set: list of objects exposing `psi` and `threshold`.
			log: list of traces; every event is a dict with the keys
				"original_label", "binary_label" and "timestamp".
		"""
		self.trace_test_set = trace_test_set
		self.log = log
		self.relabelled_log = self.relabelling_log()

	def relabelling_log(self):
		"""Return the relabelled copy of `self.log` (the input log is not modified)."""
		relabelled_log = []
		for trace in self.log:
			relabelled_trace = []
			for i, event in enumerate(trace):
				# One 0/1 entry per trace test, in trace_test_set order.
				tt_vector_event = [
					1 if self.trace_test_holds_position_i(trace, i, tt.psi, tt.threshold) else 0
					for tt in self.trace_test_set
				]
				suffix = "".join(map(str, tt_vector_event))
				relabelled_trace.append({
					"original_label": event["original_label"] + suffix,
					"binary_label": event["binary_label"],
					"timestamp": event["timestamp"],
					"trace_tests": tt_vector_event
				})
			relabelled_log.append(relabelled_trace)

		return relabelled_log

	def trace_test_holds_position_i(self, trace, i, psi, threshold):
		"""Return True when some event r <= i satisfies psi within `threshold` of event i."""
		t_i = trace[i]["timestamp"]
		# Every earlier position is inspected: no ordering of timestamps is assumed.
		return any(
			t_i - trace[r]["timestamp"] <= threshold and self.satisfies(trace[r], psi)
			for r in range(i, -1, -1)
		)

	def satisfies(self, event, psi):
		"""Return True when the event's binary label matches psi (-1 matches anything).

		The label may be a sequence of ints or a string of digits (the
		"string" form produced by EnrichedLog); each position is compared
		as an integer so both forms behave the same.
		"""
		label = event["binary_label"]
		return all(
			value == -1 or int(label[index]) == value
			for index, value in enumerate(psi)
		)

	def print_trace_test_set(self):
		"""Print psi and threshold of every trace test."""
		for trace_test in self.trace_test_set:
			print(trace_test.psi, trace_test.threshold)

#### **Main function**

In [None]:
# Load the XES event log once at module (cell) level; main() reads this global `log`.
# NOTE(review): hard-coded absolute path to a local file — adjust it for the machine this runs on.
log = xes_importer.apply("C:/Users/Simone/Desktop/UNIVERSITA/MAGISTRALE/BIOMEDICAL DECISION SUPPORT SYSTEM/Assignments/2/log-10-percent-noise.xes.gz")

In [None]:
def create_event_log(traces_list):
	"""Build an EventLog from a list of traces, wrapping each element in an Event.

	The trace objects must support slicing; trace-level attributes are not copied.
	"""
	event_log = EventLog()
	for source_trace in traces_list:
		rebuilt = Trace()
		# NOTE(review): the [:-1] slice leaves out the last element of every
		# trace — confirm that this is intended.
		for attributes in source_trace[:-1]:
			rebuilt.append(Event(attributes))
		event_log.append(rebuilt)

	return event_log

# Build the perfect alignment
# Build the perfect alignment
def build_perfect_alignment(aligned_traces):
	"""Turn raw alignments into "perfect" ones made only of (log, model) pairs.

	For every alignment:
	- synchronous moves are kept unchanged;
	- visible model-only moves become (f"{model}_DUMMY", model), i.e. a dummy
	  log label paired with the model label, so that every step remains a
	  2-tuple that consumers can unpack (a bare string was appended before);
	- log-only moves and model-only moves labelled None, ">>" or "tau" are dropped.

	Args:
		aligned_traces: iterable of alignments, each an iterable of
			(move_log, move_model) pairs.

	Returns:
		A list with one list of (log_label, model_label) tuples per alignment.
	"""
	missing = (None, ">>")
	perfect_aligned_log = []
	for aligned in aligned_traces:
		aligned_trace = []
		for move_log, move_model in aligned:
			log_missing = move_log in missing
			model_missing = move_model in missing
			if not log_missing and not model_missing:
				# synchronous move
				aligned_trace.append((move_log, move_model))
			elif log_missing and not model_missing and move_model != "tau":
				# visible model move: pair it with a dummy log label
				aligned_trace.append((f"{move_model}_DUMMY", move_model))
			# every other case (log-only move, tau, empty model move) is skipped

		perfect_aligned_log.append(aligned_trace)
	return perfect_aligned_log

def to_pm4py_log(log):
	"""Convert a list of traces of event dicts into an EventLog.

	Each event dict supplies 'timestamp' (seconds, converted with
	datetime.fromtimestamp) and 'original_label' (used as "concept:name").
	"""
	converted = EventLog()
	for enriched_trace in log:
		pm_trace = Trace()
		for entry in enriched_trace:
			when = datetime.datetime.fromtimestamp(entry['timestamp'])
			attributes = {"concept:name": entry['original_label'], "time:timestamp": when}
			pm_trace.append(Event(attributes))
		converted.append(pm_trace)
	return converted

def filter_trace(aligned_traces):
	"""Return the 'alignment' entry of every alignment result, in order."""
	return [result['alignment'] for result in aligned_traces]

def solve(log, net, initial_marking, final_marking):
	"""Build a MarkovProcess for the aligned log and return its process entropy."""
	return MarkovProcess(log, net, initial_marking, final_marking).process_entropy

def main():
	"""Compare the process entropy of the original log with two relabelled variants.

	Steps: take the first 100 traces of the global `log`, discover a net and
	align the log on it, enrich the events, relabel them with one trace test
	(set A) and with ten trace tests (set B), then for each of the three logs
	discover a net, align the log on it and print the resulting entropy.
	"""
	# The log is imported in the previous cell so that it is not reloaded on every run
	original_log = log[0:100] # author's observation: the larger the slice, the lower the entropy
	original_log = create_event_log(original_log)

	# ======================
	# Original enriched log
	# ======================
	# Discover the net and align the training log on it
	net, im, fm = pm4py.discover_petri_net_inductive(original_log)
	aligned_train = alignments.apply_log(original_log, net, im, fm)

	# ============================
	# Trace test sets A (one test) and B (ten tests)
	# ============================
	k = 3
	min_threshold = 10000
	max_threshold = 100000
	trace_test_set_a = [TraceTest(k, min_threshold, max_threshold) for _ in range(1)]
	trace_test_set_b = [TraceTest(k, min_threshold, max_threshold) for _ in range(10)]
	
	# ======================
	# Original and Relabelled logs
	# ======================
	
	original_enriched_log_obj = EnrichedLog(aligned_train, original_log, k)
	relabelled_log_obj_a = RelabelledLog(trace_test_set_a, original_enriched_log_obj.enriched_log)
	relabelled_log_obj_b = RelabelledLog(trace_test_set_b, original_enriched_log_obj.enriched_log)
	print("Enriched original log:", original_enriched_log_obj.enriched_log[0][0])
	print("Relabelled log A:", relabelled_log_obj_a.relabelled_log[0][0])
	print("Relabelled log B:", relabelled_log_obj_b.relabelled_log[0][0])

	# ======================
	# Petri nets discovered from the logs
	# ======================
	net_original_log, im_original, fm_original = pm4py.discover_petri_net_inductive(to_pm4py_log(original_enriched_log_obj.enriched_log))
	net_relabelled_log_a, im_a, fm_a = pm4py.discover_petri_net_inductive(to_pm4py_log(relabelled_log_obj_a.relabelled_log))
	net_relabelled_log_b, im_b, fm_b = pm4py.discover_petri_net_inductive(to_pm4py_log(relabelled_log_obj_b.relabelled_log))

	# ======================
	# Alignment
	# ======================
	aligned_traces_original_log = alignments.apply_log(to_pm4py_log(original_enriched_log_obj.enriched_log), net_original_log, im_original, fm_original)
	aligned_traces_relabelled_log_a = alignments.apply_log(to_pm4py_log(relabelled_log_obj_a.relabelled_log), net_relabelled_log_a, im_a, fm_a)
	aligned_traces_relabelled_log_b = alignments.apply_log(to_pm4py_log(relabelled_log_obj_b.relabelled_log), net_relabelled_log_b, im_b, fm_b)
	
	# ======================
	# Filter alignment: keep only the 'alignment' entry of each result
	# ======================
	aligned_traces_original_log = filter_trace(aligned_traces_original_log)
	aligned_traces_relabelled_log_a = filter_trace(aligned_traces_relabelled_log_a)
	aligned_traces_relabelled_log_b = filter_trace(aligned_traces_relabelled_log_b)
	
	# ======================
	# Perfect aligned log (disabled: the raw alignments are used below)
	# ======================
	# aligned_traces_original_log = build_perfect_alignment(aligned_traces_original_log)
	# aligned_traces_relabelled_log_a = build_perfect_alignment(aligned_traces_relabelled_log_a)
	# aligned_traces_relabelled_log_b = build_perfect_alignment(aligned_traces_relabelled_log_b)
	
	# ======================
	# Entropies
	# ======================
	entropy_process_original_log = solve(aligned_traces_original_log, net_original_log, im_original, fm_original)
	entropy_process_relabelled_log_a = solve(aligned_traces_relabelled_log_a, net_relabelled_log_a, im_a, fm_a)
	entropy_process_relabelled_log_b = solve(aligned_traces_relabelled_log_b, net_relabelled_log_b, im_b, fm_b)
	
	print("Entropy Original:", entropy_process_original_log)
	print("Entropy Relabelled A:", entropy_process_relabelled_log_a)
	print("Entropy Relabelled B:", entropy_process_relabelled_log_b)
	

# Run the whole experiment when the cell/script is executed.
main()	

## **Conclusioni**

#### **Dimensione del log e entropia**
- Più grande è il log di partenza, più bassa tende a essere l'entropia del processo scoperto.
- Questo perché log più grandi forniscono più esempi di comportamento, riducendo l'incertezza nella Petri Net scoperta.
- Con log piccoli, l'algoritmo ha meno esempi e quindi l'entropia è più alta.

---

#### **TraceTest e relabelling**
- Il relabelling dei TraceTest permette di costruire Petri Net più specifiche e dettagliate.
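- Ogni evento del log viene reso unico o più distinguibile aggiungendo alla sua etichetta il vettore binario dei TraceTest (una cifra `0`/`1` per ciascun test); ad esempio, con tre TraceTest:
  - `Triage` → `Triage001`
  - `Check` → `Check010`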
- Questo riduce ambiguità durante la scoperta della rete, migliorando la fedeltà della Petri Net al log originale.
- L’entropia dei log relabelled tende a essere più bassa o più stabile rispetto al log originale.

---

#### **Analisi combinazioni di TraceTest**
- Generare più TraceTest esplora più combinazioni di eventi: aumenta la varietà dei vettori binari aggiunti alle etichette e rende ogni traccia più distinguibile dalle altre.
