In [1]:
import json
import os
import pandas as pd
import torch

from openai import OpenAI
from threading import Thread
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
from util import build_event_jsons, llm_call, llm_gen, get_dfg_abstraction, get_variant_abstraction, get_petri_net_abstraction

# Display every row when a DataFrame is rendered (no truncation).
pd.set_option('display.max_rows', None)

# Alternative values for `model_` (kept commented out):
#"Qwen/Qwen3-8B"
#"meta-llama/Meta-Llama-3.1-8B-Instruct"
#"deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
#"gpt-4.1-mini"
#"gpt-4o-mini"
#"gpt-4.1"
#"gpt-4o"

model_ = "gpt-4o-mini"

# Credentials must not live in the notebook (they end up in version control
# and in shared copies). Read the key from the environment instead; an empty
# string is passed when the variable is unset.
# NOTE(review): whether llm_call needs a key for the non-OpenAI models listed
# above is not visible from this file -- confirm in util.
api_key = os.environ.get("OPENAI_API_KEY", "")

# Model handle that is later passed to llm_gen as `model_instance`.
llm = llm_call(model_version = model_, api_key= api_key)

In [2]:
SYSTEM_PROMPT = """
You are an event log imperfection pattern detector for process mining event log data quality.
You will receive a JSON ARRAY containing events.
The input scope may vary from a **SINGLE CASE** to **MULTIPLE CASES**.

Your job is to read event logs and identify events that exhibit imperfection patterns.
Imperfection patterns are recurring data quality issues that arise from how events are
captured, labeled, grouped into cases, or timestamped.

This is NOT generic anomaly detection.
Evaluate the event log strictly according to the imperfection pattern definitions
provided in this system prompt.

--------------------------------
ADAPTIVE ANALYSIS STRATEGY
--------------------------------
1. DETERMINE INPUT SCOPE:
   - Check if the input contains events from a single `case_id` or multiple distinct `case_id`s.

2. IF MULTIPLE CASES (Cross-Case Validation):
   - Leverage the available context to confirm patterns.
   - **Distinguish Systematic Error vs. Standard Process:**
     - CAUTION: Do NOT assume that a pattern is "correct" just because it appears frequently.
     - **Systematic Imperfections:** If a pattern matches the definition of "Form-based Capture" (Pattern 1) or "Polluted Label" (Pattern 3) consistently across many cases, it is a **Systematic Data Quality Issue**. It MUST be flagged.
     - **Standard Process Variants:** Only treat recurring structural sequences as "Standard Variants" if they do NOT violate any specific imperfection pattern rules.
   
   - **Canonical Form Determination (For Pattern 6):**
     - **Priority 1 (Linguistic Validity):** Use internal knowledge of vocabulary. If one variant is a valid dictionary word/standard business term (e.g., "Receive") and the other is a typo (e.g., "Recieve"), the valid word is ALWAYS the "Canonical Form", regardless of frequency.
     - **Priority 2 (Frequency):** Use frequency ONLY when linguistic checking is N/A (e.g., alphanumeric codes like "A_001" vs "A_OO1"). In these cases, the dominant value is "Canonical".

   - **Vocabulary Alignment (For Pattern 5):**
     - Compare activity vocabularies. If Case A uses "Submit" and Case B uses "Submission" in equivalent process steps, detect them as Synonymous Labels.

3. IF SINGLE CASE (Local Context Only):
   - Rely STRICTLY on the internal structural and semantic rules defined in each pattern.
   - **Disable Frequency Logic:** Do NOT attempt statistical outlier detection or assume a label is "rare", as the sample size is insufficient.
   - **For Pattern 6 (Distorted Label):** Rely EXCLUSIVELY on Linguistic Validity (Dictionary/Business Term check).
     - If Label A is a valid word and Label B is a typo, flag B.
     - **Constraint:** If BOTH are non-dictionary codes (e.g., "X_001" vs "X_0O1") and you cannot determine the canonical form linguistically, remain CONSERVATIVE and do NOT flag them (return empty), rather than guessing.
     
--------------------------------
ANALYSIS PRINCIPLES
--------------------------------
- **Internal Sorting:** Before analysis, mentally SORT events by `timestamp` within each `case_id`. Chronological order is critical for detecting sequential patterns (e.g., Collateral Events, Homonymous Loops).
- **Independence:** Analyze each case independently. Use cross-case insights ONLY as defined in the "ADAPTIVE ANALYSIS STRATEGY".
- **Strict Adherence:** Be conservative. Detect patterns ONLY when the Detection rule is satisfied exactly. Do not flag "suspicious" events that do not meet the strict criteria.
- **Overlap:** An event may satisfy zero, one, or multiple imperfection patterns.
- **Factuality:** Treat all field values (case_id, timestamp, activity) as recorded facts. Do NOT infer missing events or correct typos implicitly (unless flagging them as Pattern 6).
- **Verbatim Output:** When outputting result events, you must preserve the original values character-by-character (especially trailing zeros in timestamps).
- **Self-Correction:** Before generating the final JSON, review your list. If an event is flagged but doesn't strictly meet the rule, remove it.
- **Completeness:** If no events match any pattern, return the standard JSON structure with empty lists for all keys.

--------------------------------
INPUT
--------------------------------
You will receive a JSON ARRAY representing one or more CASES.

Each event object has the following structure:
- event_id (string): Unique identifier for the event.
- case_id (string): Identifier for the process instance.
- timestamp (string): String format "%Y-%m-%d %H:%M:%S.%f". 
  * NOTE: For Pattern 1, treat this as an opaque string. For Pattern 2, parse it to calculate time differences.
- activity (string): Name of the process step.

Events in the input array may be interleaved (mixed cases) and unsorted.

--------------------------------
OUTPUT (STRICT JSON ONLY)
--------------------------------
Return ONLY a single valid JSON object. 
DO NOT include markdown formatting (like ```json ... ```). 
DO NOT include any introductory or concluding text.

The top-level JSON object MUST contain exactly one key per imperfection pattern defined below.
Each key maps to a list of event objects that satisfy that pattern’s detection rule.

The same event MAY appear in the lists of multiple imperfection patterns
if it satisfies the detection rules of those patterns independently.

Output schema:

{
  "<PATTERN_NAME>": [
    {
      "event_id": <string>,
      "case_id": <string>,
      "timestamp": <string>,
      "activity": <string>
    }
  ]
}

OUTPUT RULES:
- **Valid JSON:** The output must be parseable by `json.loads()` in Python without errors. Handle escaping for special characters in activity names if necessary.
- **Completeness:** Every pattern key defined in definitions MUST be present. If no events are detected for a pattern, the value MUST be an empty list `[]`.
- **Identity:** Each list item must be a complete copy of the input event object.
- **Data Integrity:** The output `case_id`, `timestamp`, and `activity` MUST match the input event verbatim. 
  * CRITICAL: Do NOT truncate milliseconds (e.g., maintain ".000000" if present).
- **No Duplicates:** Do not list the same event object twice within the SAME pattern list.
- **Grouping:** Do NOT output partial groups. If a pattern rule says "tag the pair", both events must be in the list.

--------------------------------
IMPERFECTION PATTERN DEFINITIONS
--------------------------------

Pattern 1: Form-based Event Capture
(Output key name MUST be exactly "Form-based Event Capture")

Description:
This pattern occurs when multiple events within the same case are recorded with exactly the same timestamp (collision). Even if this is the "normal" behavior of the source system (e.g., batch processing), it creates ordering ambiguity for process mining algorithms and MUST be flagged.

Detection Rule:
- Context Awareness:
  - (Multi-Case): Check if the *same set of activities* frequently shares identical timestamps across different cases. This confirms systematic form-based capture.

- Scope:
  - Consider only events within the SAME case_id.

- Core Logic:
  - For each distinct timestamp STRING VALUE in the case:
    1. Compare timestamp values using FULL string equality (character-by-character).
    2. Treat timestamps as opaque strings. Do NOT use parsing or partial matching.
    3. If a timestamp string value appears **2 or more times** exactly within the same case_id, the group qualifies.

- Independence from Activity Labels:
  - Do NOT filter out events based on their activity names.
  - Even if the simultaneous events share the EXACT SAME activity name, they MUST still be tagged if timestamps collide.

- Output:
  - Tag ALL events whose timestamp matches the colliding value. Do NOT output partial groups.

Pattern 2: Collateral Events
(Output key name MUST be exactly "Collateral Events")

Description:
This pattern captures groups of events within the same case that represent a SINGLE logical process step but are recorded as multiple events due to:
1. Redundant system-level logging (echoes) occurring in milliseconds.
2. Inter-system communication (e.g., distinct systems logging the same update) occurring within a short timeframe.
Essentially, these are multiple low-level representations of ONE business action.

Detection Rule:
- Context Awareness:
  - (Multi-Case): Distinguish "Process Flow" from "Collateral Cluster".
    - **Process Flow (Normal):** Distinct steps (e.g., A -> B -> C) appearing together but with variable time gaps due to human/business factors.
    - **Collateral Events (Imperfection):** A **cluster of activities** (e.g., pair, triplet, or larger set) that appear as an **indivisible atomic unit**. They exhibit rigid temporal alignment across all cases, suggesting they are mechanically triggered as a single block.

- Scope:
  - Consider only events within the SAME case_id.

- Core Logic:
  1. Identify temporal clusters of events.
     - Criterion: A group of contiguous events occurring within a **short relative timeframe (e.g., < 300 seconds)**.
     - Note: Do not look only for pairs. Look for ANY group (size >= 2) that fits the timeframe.

  2. For each cluster, apply the **SEMANTIC REDUNDANCY CHECK**:
     - Do these multiple events semantically map to **ONE single business step**?
     - Indicator A (Shared Object): Activities act on the same entity (e.g., "Create Invoice", "Invoice Created", "Invoice Sent").
     - Indicator B (Technical Verbosity): The sequence represents a technical transaction rather than business progress (e.g., "Start", "Try", "Commit", "Ack").
  
  3. Filter:
     - Keep ONLY the events that satisfy the Semantic Redundancy Check.
     - Exclude distinct business steps that naturally happen fast.

- Output:
  - Tag ALL AND ONLY the filtered events from step 3.

Pattern 3: Polluted Label
(Output key name MUST be exactly "Polluted Label")

Description:
This pattern occurs when multiple distinct activity values in the same case share the same immutable (boiler-plate) structure, but differ due to mutable qualifiers embedded in the activity string (e.g., IDs, numbers).

Detection Rule:
- Context Awareness:
  - (Multi-Case): Compare activity strings across cases. If a specific substring varies between cases while the rest remains static, this confirms a mutable identifier.

- Scope:
  - Consider only events within the SAME case_id.

- Core Logic:
  - Detect "mutable qualifier substrings" in the activity string:
    - Type A (Long Numeric): Regex `[0-9]{8,}`
    - Type B (Mixed Code): Regex `[A-Za-z0-9]{6,}` containing at least one digit AND one letter.
  
  - Delimiter Set:
    - Characters: `_` `-` `:` `/` `#` `.` ` ` (space)
  
  - Adjacency Condition:
    - The mutable substring MUST be immediately adjacent (left or right) to at least one delimiter.

- Output:
  - Tag an event if it satisfies BOTH the mutable substring check and the adjacency condition.

Pattern 4: Homonymous Label
(Output key name MUST be exactly "Homonymous Label")

Description:
This pattern occurs when the same activity label is used for **multiple semantically distinct process steps**. Unlike a "True Loop" (where the exact same task is repeated), Homonymous Labels hide different business actions (e.g., "Assess" at start vs. "Assess" after appeal) under a single coarse-grained name, creating ambiguity in the process flow.

Detection Rule:
- Context Awareness:
  - (Multi-Case): Check consistency of repetition to distinguish "Safe Routine" from "Systematic Homonym".
    - **Safe Routine (Batching):** If immediate repetition (A -> A) is observed across MOST cases, confirm it as a standard batching behavior. -> **IGNORE.**
    - **Systematic Homonym (Process Loop):** If a disjoint repetition (A -> ... -> A) is observed across MOST cases, it confirms a "Standard Process Loop". In Process Mining, these are high-value Homonyms (hiding state changes like "Re-work"). -> **MUST FLAG.**

- Scope:
  - Consider only events within the SAME case_id.

- Core Logic:
  - For each distinct activity STRING VALUE in the case:
    1. Collect ALL events strictly matching that string.
    2. If Count < 2, ignore.
    3. If Count >= 2, apply **CONTEXTUAL DIVERGENCE CHECK**:
       
       - **Condition A (Phase Shift - STRONG SIGNAL):**
         - The activity reappears after a sequence of *different* intervening events (e.g., A -> B -> C -> A).
         - Logic: The intervening events usually change the state of the case, implying the second 'A' has a different context (e.g., "Review" vs "Re-Review"). **MUST FLAG.**

       - **Condition B (Long-Gap Self-Loop):**
         - The activity repeats immediately (A -> A) BUT with a significant time gap (relative to average case duration).
         - Logic: Indicates distinct sessions or logical restarts rather than a continuous batch task. **FLAG.**

       - **Condition C (Simultaneous):**
         - The activity repeats at the exact same timestamp. **FLAG.**

  - **Exception (Safe Repetitions - Do NOT Flag):**
    - **Short-Term Immediate Self-Loop:**
      - The activity repeats immediately (A -> A -> A) within a short timeframe.
      - **Multi-Case Validation:** If this "stuttering" is frequent across other cases, it confirms "Systematic Batch Entry". Treat as standard routine and **IGNORE**.

- Output:
  - Tag ALL events in every qualifying activity group. Do NOT output partial groups.

Pattern 5: Synonymous Labels
(Output key name MUST be exactly "Synonymous Labels")

Description:
This pattern refers to a situation where a group of activity labels are syntactically different but semantically equivalent (aliases). This typically occurs when merging data from heterogeneous systems.

Detection Rule:
- Context Awareness:
  - (Multi-Case): Compare activity vocabularies ACROSS different cases. If different cases use distinct terms to describe a structurally equivalent step, treat them as candidates.

- Scope:
  - Analyze the entire set of distinct activity STRING VALUES found in the input.

- Core Logic:
  - For each pair of distinct activity strings (A, B), detect if they satisfy EITHER:
  
  1) Syntactic Similarity (Format/Style Variations Only):
     - The strings are strictly unequal but structurally very close due to:
       - Differences in case sensitivity (upper/lower case).
       - Differences in punctuation, spacing, or delimiters (e.g., "Step_1" vs "Step 1").
       - Differences in word order.
       - Standard abbreviations (e.g., "Acct" vs "Account").
     - **NOTE:** Do NOT include random typos here (handle in Pattern 6).

  2) Strong Semantic Equivalence (Synonyms):
     - The strings are textually distinct but represent the IDENTICAL real-world process step.
     - Criteria: One is a formal name, the other a colloquial alias/code (e.g., "Ship" vs "Dispatch").

- Exclusion Rule:
  - Do NOT tag pairs representing distinct process states (e.g., Start vs End).
  - Do NOT tag pairs differing by specific numeric IDs (handle in Pattern 3).

- Output:
  - If a pair (A, B) is detected, tag ALL events with activity A OR B. Do NOT output partial groups.

Pattern 6: Distorted Label
(Output key name MUST be exactly "Distorted Label")

Description:
This pattern refers to activity values containing specific character-level corruptions (typos, OCR faults) of a correct canonical form. Unlike "Synonymous Labels", this pattern identifies clear "Noise" (distorted) versus "Signal" (canonical).

Detection Rule:
- Context Awareness (Determine Canonical vs. Distorted):
  - **Rule 1: Linguistic Correctness (Primary Priority):**
    - Check strings against standard dictionary words or business terminology.
    - If one is valid and the other is invalid (e.g., "Start" vs "Strat"), the valid word is ALWAYS "Canonical".
    - **CRITICAL:** Apply this FIRST. Dictionary check overrides frequency.
  
  - **Rule 2: Frequency & Consistency (Secondary Priority - Tie-Breaker):**
    - Apply this if Rule 1 is **INCONCLUSIVE** (e.g., **Case Mutation** where both "Submit" and "submit" are valid words) OR **IMPOSSIBLE** (e.g., codes like "X_001" vs "X_0O1").
    - **Logic:** Compare with the majority. The string appearing in the majority of cases is "Canonical" (Standard). The rare variation is "Distorted" (Noise).

- Scope:
  - Analyze the set of distinct activity STRING VALUES found in the input.

- Core Logic:
  - Detect if a pair (A, B) exhibits one of these 5 distortion types:
    1) Case Mutation: Identical chars, different capitalization (e.g., "Open" vs "open").
    2) Character Omission (Skip): Exactly ONE missing character.
    3) Character Insertion (Insert): Exactly ONE extra character.
    4) Character Transposition (Interchange): Two adjacent chars swapped.
    5) Keyboard Proximity: Exactly ONE char substituted by a QWERTY neighbor.

- Exclusion Rule:
  - Do NOT tag pairs differing by variable IDs/numbers (Pattern 3).
  - Do NOT tag completely different words (Pattern 5).

- Output:
  - If a pair is detected:
    1. Identify ONLY the events with the "Distorted" label (the error) using Rule 1 & 2.
    2. Tag ONLY these distorted events.
    3. **STRICTLY EXCLUDE** the "Canonical" (correct) events from the output.
    
"""




In [7]:
#"./dataset/Pub-Collateral.csv" #Pub-Formbased.csv"
#"./dataset/BPIC11-FormBased.csv"
#"./dataset/Pub-Formbased.csv"
#"./dataset/Credit-Formbased.csv"
#"./dataset/BPIC11-CollateralEvent.csv"
# "./dataset/Pub-Polluted.csv"
#"./dataset/BPIC11-Polluted.csv"
#"./dataset/Pub-Synonymous.csv"
#BPIC15-Homonymous
#"./dataset/BPIC15-Synonymous.csv"
#"./dataset/Pub-Homonymous.csv" <- wrong
#"./dataset/Credit-Homonymous.csv"
# Event log to analyse (alternative dataset paths are listed in the comments above).
LOG_NAME = "./dataset/Credit-Homonymous.csv"

# df_new is used below as a pandas DataFrame of the log; cases_json is indexed
# by chunk and each chunk is a list of event dicts.
# NOTE(review): chunk_cases presumably sets how many cases go into one chunk --
# confirm in util.build_event_jsons.
df_new, cases_json = build_event_jsons(log_name = LOG_NAME, chunk_cases = 5)

# Index of the chunk in `cases_json` that is sent to the LLM.
idx = 38  # 682 (other index left by the author)

# Fail here with a clear message instead of deep inside the prompt f-string.
if not -len(cases_json) <= idx < len(cases_json):
    raise IndexError(
        f"idx={idx} is out of range: {LOG_NAME} produced {len(cases_json)} chunks"
    )

# Which global process abstractions are added to PART 1 of the user prompt.
USE_VARIANT   = True
USE_DFG       = True
USE_PETRI_NET = True


In [4]:
# Directly-follows-graph abstraction of the whole log, printed for inspection.
# The prompt-assembly cell recomputes it when USE_DFG is set.
print(dfg_str := get_dfg_abstraction(df_new))

### DIRECTLY-FOLLOWS GRAPH (DFG)
This abstraction captures the direct succession of activities (A -> B) to identify process flows and bottlenecks.
- **Total Transitions**: 25 unique paths observed.
- **Metrics**: 'transition_count' (Frequency strength), 'mean_duration_seconds' (Average time taken between activities).

Request info -> info received (transition_count=6854, mean_duration_seconds=849.89)
info received -> Check for completeness (transition_count=6249, mean_duration_seconds=10.00)
Perform checks -> Make decision (transition_count=6083, mean_duration_seconds=906.75)
Check for completeness -> New online application received (transition_count=4547, mean_duration_seconds=0.00)
Check for completeness -> Request info (transition_count=3925, mean_duration_seconds=874.72)
Check for completeness -> Perform checks (transition_count=3860, mean_duration_seconds=824.69)
Make decision -> notify reject (transition_count=3406, mean_duration_seconds=711.92)
Deliver card -> EVENT 13 END (tran

In [5]:
# Process-variant abstraction of the whole log, printed for inspection.
# The prompt-assembly cell recomputes it when USE_VARIANT is set.
print(variant_str := get_variant_abstraction(df_new))

### PROCESS VARIANTS (Top 50)
This list represents the most frequent activity sequences (paths) found in the event log.
- **Summary**: Showing top 50 out of 334 unique variants.
- **Coverage**: These variants cover **4475** out of **5000** total cases (**89.5%**).

Check for completeness -> New online application received -> Perform checks -> Make decision -> Notify accept -> Deliver card -> EVENT 13 END (case_count=1081, mean_case_duration_seconds=3921.89)
Check for completeness -> New online application received -> Perform checks -> Make decision -> notify reject -> time out -> EVENT 13 END (case_count=595, mean_case_duration_seconds=3258.51)
Check for completeness -> New online application received -> Request info -> info received -> Check for completeness -> Perform checks -> Make decision -> Notify accept -> Deliver card -> EVENT 13 END (case_count=562, mean_case_duration_seconds=5770.65)
Check for completeness -> New online application received -> Request info -> info received ->

In [6]:
# Petri-net abstraction of the whole log, printed for inspection.
# The prompt-assembly cell recomputes it when USE_PETRI_NET is set.
print(petri_net_str := get_petri_net_abstraction(df_new))

### PETRI NET ABSTRACTION (Process Flow)
**Mining Parameter**: Heuristics Miner (Dependency Threshold = 0.5).
Use this structure to understand the logical sequence of activities.
- **Nodes**: 'PROCESS_START' (Start), 'PROCESS_END' (End), 'State_X' (Intermediate Stages).
- **Transitions**: Real activities (e.g., 'Submit') vs. 'ROUTING_LOGIC' (Invisible system logic for branching/merging).

**Visible Activities (Business Steps):**
['review request received', 'time out', 'EVENT 13 END', 'Perform checks', 'New online application received', 'Check:homonymous', 'info received', 'Check for completeness', 'Request info', 'Make decision', 'Notify accept', 'notify reject', 'Deliver card']

**Process Flows (Arcs):**
(State_0 -> ROUTING_LOGIC_25)
(State_5 -> Deliver card)
(Check:homonymous -> State_7)
(ROUTING_LOGIC_14 -> State_1)
(ROUTING_LOGIC_26 -> State_9)
(State_4 -> ROUTING_LOGIC_26)
(State_8 -> ROUTING_LOGIC_14)
(Perform checks -> State_13)
(State_12 -> Request info)
(PROCESS_START -> ROUTI

In [8]:
# --- Collect the enabled global abstractions --------------------------------
context_parts = []    # abstraction texts, in the order they appear in PART 1
active_contexts = []  # display names of the abstractions that are switched on

if USE_VARIANT:
    variant_str = get_variant_abstraction(df_new)
    context_parts += [variant_str]
    active_contexts += ["Variant List"]

if USE_DFG:
    dfg_str = get_dfg_abstraction(df_new)
    context_parts += [dfg_str]
    active_contexts += ["DFG"]

if USE_PETRI_NET:
    petri_net_str = get_petri_net_abstraction(df_new)
    context_parts += [petri_net_str]
    active_contexts += ["Petri Net"]

full_context_str = "\n\n".join(context_parts)

# --- PART 1 section text: global context, or an explicit "none" notice ------
if not full_context_str:
    part1_section = "### PART 1: PROCESS ABSTRACTIONS\n(No global context provided. Analyze based on local patterns only.)"
else:
    part1_section = f"""
### PART 1: PROCESS ABSTRACTIONS (Global Knowledge)
The following information summarizes the **entire event log**. 
Use this to understand standard behaviors, frequent paths, and the official process flow.
**Cross-reference** the raw events in PART 2 with this context ({', '.join(active_contexts)}) to validate patterns.

{full_context_str}
"""

# --- Bullet lines for step 2 ("Hybrid Analysis") of the instructions --------
if full_context_str:
    hybrid_instruction_lines = ["   - Combine **Global Knowledge (PART 1)** with **Local Observation (PART 2)**."]

    # Structural abstractions are named as the reference for loop verification.
    if USE_DFG or USE_PETRI_NET:
        sources = [
            label
            for enabled, label in ((USE_DFG, "DFG"), (USE_PETRI_NET, "Petri Net"))
            if enabled
        ]
        source_str = " or ".join(sources)
        hybrid_instruction_lines.append(f"   - *Structure Check*: If you suspect a 'Homonymous Label' (Loop) in PART 2, verify against the **{source_str}** in PART 1. If the loop is structurally present there, it is likely a systematic pattern.")

    # The variant list is named as the reference for timestamp-collision checks.
    if USE_VARIANT:
        hybrid_instruction_lines.append("   - *Frequency Check*: If you suspect 'Form-based Capture' (Timestamp collision), check the **Variant List** in PART 1. Random ordering of the same activities across frequent variants confirms the pattern.")
else:
    hybrid_instruction_lines = ["   - **Local Analysis Only**: No global context is provided in PART 1. Rely STRICTLY on the internal logic of the raw event log in PART 2."]

hybrid_analysis_block = "\n".join(hybrid_instruction_lines)

# --- Final user message: PART 1 + the selected event chunk + instructions ---
USER_PROMPT = f"""
{part1_section}

---

### PART 2: RAW EVENT LOG SNIPPET (Target for Analysis)
Detect imperfection patterns in the specific events below.

INPUT EVENT LOG (JSON):
{json.dumps(cases_json[idx], ensure_ascii=False)}

---
EXECUTION INSTRUCTIONS:
1. **Scope Analysis**: First, identify if the input contains a SINGLE case or MULTIPLE cases to apply the correct "ADAPTIVE ANALYSIS STRATEGY" defined in the System Prompt.
2. **Hybrid Analysis**: 
{hybrid_analysis_block}
3. **Strict Detection**: Apply the detection rules for all 6 patterns.
   - Be CONSERVATIVE: If an event is ambiguous or does not strictly meet the rule, exclude it.
   - Apply Cross-Case Validation where applicable.
4. **Data Integrity Check**: Ensure that `timestamp` (including milliseconds) and `activity` strings in your output match the input **character-by-character**.
5. **Format**: Return **ONLY** the raw JSON object.
   - NO Markdown code blocks (e.g., ```json ... ```).
   - NO explanatory text.
   - Ensure all 6 pattern keys exist in the output object.
"""



In [10]:
# Show the fully rendered user prompt for a manual check before it is sent to the model.
print(USER_PROMPT)



### PART 1: PROCESS ABSTRACTIONS (Global Knowledge)
The following information summarizes the **entire event log**. 
Use this to understand standard behaviors, frequent paths, and the official process flow.
**Cross-reference** the raw events in PART 2 with this context (Variant List, DFG, Petri Net) to validate patterns.

### PROCESS VARIANTS (Top 50)
This list represents the most frequent activity sequences (paths) found in the event log.
- **Summary**: Showing top 50 out of 334 unique variants.
- **Coverage**: These variants cover **4475** out of **5000** total cases (**89.5%**).

Check for completeness -> New online application received -> Perform checks -> Make decision -> Notify accept -> Deliver card -> EVENT 13 END (case_count=1081, mean_case_duration_seconds=3921.89)
Check for completeness -> New online application received -> Perform checks -> Make decision -> notify reject -> time out -> EVENT 13 END (case_count=595, mean_case_duration_seconds=3258.51)
Check for completeness

In [5]:
# Two-message chat: the detection rules as the system turn, the assembled
# abstractions plus event chunk as the user turn.
prompt = [
    dict(role="system", content=SYSTEM_PROMPT),
    dict(role="user", content=USER_PROMPT),
]

# Run the selected model on the prompt; later cells read the result as a
# mapping from pattern name to a list of flagged events.
test_output = llm_gen(model_version=model_, model_instance=llm, prompt=prompt)

In [6]:
# Dump the raw llm_gen result for a quick look before the per-pattern breakdown.
print(test_output)

{'Form-based Event Capture': [{'event_id': '2431', 'case_id': '190', 'timestamp': '2023-10-10 06:38:32.551', 'activity': 'Check for completeness'}, {'event_id': '2432', 'case_id': '190', 'timestamp': '2023-10-10 06:38:32.551', 'activity': 'New online application received'}, {'event_id': '2483', 'case_id': '193', 'timestamp': '2023-10-10 07:54:35.806', 'activity': 'Check for completeness'}, {'event_id': '2484', 'case_id': '193', 'timestamp': '2023-10-10 07:54:35.806', 'activity': 'New online application received'}, {'event_id': '2451', 'case_id': '192', 'timestamp': '2023-10-10 07:53:53.954', 'activity': 'Check for completeness'}, {'event_id': '2452', 'case_id': '192', 'timestamp': '2023-10-10 07:53:53.954', 'activity': 'New online application received'}, {'event_id': '2493', 'case_id': '194', 'timestamp': '2023-10-10 11:30:21.295', 'activity': 'Check:homonymous'}, {'event_id': '2494', 'case_id': '194', 'timestamp': '2023-10-10 11:30:21.295', 'activity': 'New online application received

In [9]:
# Print the flagged events grouped by pattern, in the order in which the
# system prompt defines the six output keys.
for pattern_label in (
    'Form-based Event Capture',
    'Collateral Events',
    'Polluted Label',
    'Homonymous Label',
    'Synonymous Labels',
    'Distorted Label',
):
    print('\n' + pattern_label)
    # The model may leave a key out despite the instructions; treat that as
    # "nothing detected" rather than aborting the cell with a KeyError.
    for flagged_event in test_output.get(pattern_label, []):
        print(flagged_event)


Form-based Event Capture
{'event_id': '2431', 'case_id': '190', 'timestamp': '2023-10-10 06:38:32.551', 'activity': 'Check for completeness'}
{'event_id': '2432', 'case_id': '190', 'timestamp': '2023-10-10 06:38:32.551', 'activity': 'New online application received'}
{'event_id': '2483', 'case_id': '193', 'timestamp': '2023-10-10 07:54:35.806', 'activity': 'Check for completeness'}
{'event_id': '2484', 'case_id': '193', 'timestamp': '2023-10-10 07:54:35.806', 'activity': 'New online application received'}
{'event_id': '2451', 'case_id': '192', 'timestamp': '2023-10-10 07:53:53.954', 'activity': 'Check for completeness'}
{'event_id': '2452', 'case_id': '192', 'timestamp': '2023-10-10 07:53:53.954', 'activity': 'New online application received'}
{'event_id': '2493', 'case_id': '194', 'timestamp': '2023-10-10 11:30:21.295', 'activity': 'Check:homonymous'}
{'event_id': '2494', 'case_id': '194', 'timestamp': '2023-10-10 11:30:21.295', 'activity': 'New online application received'}

Collate

In [10]:
# Case ids contained in the analysed chunk, de-duplicated in first-seen order.
unique_case_ids = list(dict.fromkeys(e["case_id"] for e in cases_json[idx]))

# Invert the model output: event_id -> names of every pattern that flagged it.
eid2patterns: dict[str, list[str]] = {}
for pattern_name, events in test_output.items():
    # `events or []` tolerates a null value returned for a pattern.
    for e in events or []:
        flagged_by = eid2patterns.setdefault(str(e["event_id"]), [])
        # The model may repeat an event within one pattern; record it once.
        if pattern_name not in flagged_by:
            flagged_by.append(pattern_name)

# Keys were normalised with str() above, so normalise on lookup as well;
# otherwise a non-string event_id column would silently match nothing.
# NOTE: this adds/overwrites the "detected" column on df_new in place.
df_new["detected"] = df_new["event_id"].map(lambda eid: eid2patterns.get(str(eid), []))

# Display only the rows belonging to the cases of the analysed chunk.
df_new[df_new['case_id'].isin(unique_case_ids)]

Unnamed: 0,event_id,case_id,timestamp,activity,label,detected
2431,2431,190,2023-10-10 06:38:32.551,Check for completeness,,[Form-based Event Capture]
2432,2432,190,2023-10-10 06:38:32.551,New online application received,,[Form-based Event Capture]
2433,2433,190,2023-10-10 06:43:20.725,Perform checks,,[]
2434,2434,190,2023-10-10 06:44:30.989,Make decision,,[]
2435,2435,190,2023-10-10 06:50:20.797,notify reject,,[]
2436,2436,190,2023-10-10 07:14:26.014,time out,,[]
2437,2437,190,2023-10-10 07:14:36.014,EVENT 13 END,,[]
2438,2438,191,2023-10-10 07:29:03.023,Check for completeness,,[]
2439,2439,191,2023-10-10 07:29:03.023,New online application received,,[]
2440,2440,191,2023-10-10 07:49:20.081,Request info,,[]
