In [None]:
!pip install pm4py



In [None]:
!pip install pm4py pandas

import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery

# Load the raw event log from CSV.
file_path = "/content/Credit-TRAIN-HOMONYM-0.1-0.CSV"
df = pd.read_csv(file_path)

# Rename the CSV headers to the XES-style column names passed on to pm4py
# (case identifier, activity label, event timestamp).
column_mapping = {
    "Case": "case:concept:name",
    "Activity": "concept:name",
    "Timestamp": "time:timestamp",
}
df_pm4py = df.rename(columns=column_mapping)

# Parse the timestamp column; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
parsed_timestamps = pd.to_datetime(df_pm4py["time:timestamp"], errors="coerce")
df_pm4py["time:timestamp"] = parsed_timestamps
df_pm4py = dataframe_utils.convert_timestamp_columns_in_df(df_pm4py)

# Convert the dataframe into a pm4py event log and run DFG discovery on it.
event_log_variant = log_converter.Variants.TO_EVENT_LOG
log = log_converter.apply(df_pm4py, variant=event_log_variant)
dfg = dfg_discovery.apply(log)

# Render every edge as one "source -> target : count" text line.
dfg_text = []
for (source, target), count in dfg.items():
    dfg_text.append(f"{source} -> {target} : {count}")

# Echo the edges to the cell output.
for entry in dfg_text:
    print(entry)

# Persist the same lines, one edge per line, for the later prompt cells.
with open("DFG_as_text.txt", "w") as f:
    f.writelines(f"{entry}\n" for entry in dfg_text)

print("DFG 전처리 완료 및 저장 완료!")




  return fn(*args, **kwargs)


Check for completeness -> New online application received : 3587
New online application received -> Perform checks : 1549
Perform checks -> Make decision : 4820
Make decision -> Notify accept : 2355
Notify accept -> Deliver card : 2355
Deliver card -> EVENT 13 END : 2629
New online application received -> application check : 187
application check -> Make decision : 533
New online application received -> Request info : 1631
Request info -> info received : 4461
info received -> Check for completeness : 4979
Check for completeness -> Perform checks : 3089
Make decision -> notify reject : 2480
notify reject -> time out : 1256
time out -> EVENT 13 END : 1371
Check for completeness -> receive information : 412
receive information -> Perform checks : 182
notify reject -> review request received : 1103
review request received -> Check for completeness : 1222
New online application received -> request information : 220
request information -> info received : 518
Check for completeness -> Request

In [None]:
import openai
import os

# Take the API key from the OPENAI_API_KEY environment variable rather than
# hard-coding it in the notebook. When the variable is unset this falls back
# to the empty string the cell used originally.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# Load the DFG text file; only its first 50 lines are sent in this single
# request.
with open("DFG_as_text.txt", "r") as f:
    lines = f.readlines()
preprocessed_text = "".join(lines[:50])

# Prompt asking the model to flag homonym-style activity label errors.
# The line introducing the input explains the "source -> target : count"
# format of DFG_as_text.txt so the trailing frequency is not read as a case
# ID. It replaces a developer note ("need to add more explanation") that had
# been left inside the prompt string and was being sent to the model.
prompt = f"""
You are an expert in analyzing business process logs. The following input describes several process cases with sequences of activities.

Your task is to identify **homonym-based errors**, where:
- An activity uses a name that looks similar or identical to a correct activity,
- But the meaning or use of the word is inconsistent with the typical process flow.
- These errors may involve word ambiguity, misuse of similar terms, or naming inconsistencies.

Please examine the event traces carefully and highlight activities that:
- Have ambiguous or misleading names,
- Appear out of place based on context,
- Or could be mistaken due to being homonyms or near-homonyms of valid process activities.

For each suspected homonym error, provide the following:
- Case ID (if available)
- The ambiguous or incorrect activity name
- Its position in the sequence (e.g., 2nd activity)
- The expected or likely correct activity name
- A brief explanation of why the activity is likely a homonym error

Here is the process log input. Each line is a directly-follows relation written as "source activity -> target activity : frequency"; the trailing number counts how often the pair occurs and is not a case ID:
{preprocessed_text}

Return your findings in a structured format.
"""


# Send the prompt as a single chat completion request.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a business process analysis expert."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.2,
    max_tokens=3000
)

# Show the text of the first returned choice.
print(response.choices[0].message.content)


### Homonym-based Errors Identified:

1. **Case ID:** Not available
   - **Incorrect Activity:** application check
   - **Position:** 2nd activity
   - **Expected Correct Activity:** Perform checks
   - **Explanation:** The activity "application check" is likely a homonym error as it should be "Perform checks" based on the context of the process flow.

2. **Case ID:** Not available
   - **Incorrect Activity:** request information
   - **Position:** 2nd activity
   - **Expected Correct Activity:** Request info
   - **Explanation:** The activity "request information" is likely a homonym error as it should be "Request info" to maintain consistency in naming conventions.

3. **Case ID:** Not available
   - **Incorrect Activity:** send notification
   - **Position:** 2nd activity
   - **Expected Correct Activity:** Notify accept
   - **Explanation:** The activity "send notification" is likely a homonym error as it should be "Notify accept" to align with the process flow.

4. **Case ID:** No

In [None]:
# Build the synonym-error variant of the analysis prompt.
# NOTE: this cell only assigns `prompt`; it does not send a request itself.
# The f-string interpolates `preprocessed_text`, which is not defined in this
# cell and therefore must already exist from a previously executed cell.
prompt = f"""
You are an expert in analyzing business process logs. The following input describes several process cases with sequences of activities.

Your task is to identify **synonym-based errors**, where:
- An activity name uses a different word or phrase that **has the same or similar meaning** as the correct activity label,
- But it causes **inconsistency** in naming conventions or violates the expected vocabulary of the process,
- These errors often result from inconsistent use of synonyms, paraphrasing, or overly verbose labels.

Please examine the event traces carefully and highlight activities that:
- Use synonyms instead of the standard activity name,
- Introduce unnecessary variation in phrasing,
- Or conflict with the established terminology in the same process.

For each suspected synonym-based error, provide the following:
- Case ID (if available)
- The inconsistent or alternate activity name
- Its position in the sequence (e.g., 4th activity)
- The expected or standardized activity name
- A brief explanation of why the activity is likely a synonym error and how it deviates from standard naming

Here is the process log input:
{preprocessed_text}

Return your findings in a structured format.
"""


In [None]:
import openai
import time

import os

# Take the API key from the OPENAI_API_KEY environment variable rather than
# hard-coding it in the notebook. When the variable is unset this falls back
# to the empty string the cell used originally.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

with open("DFG_as_text.txt", "r") as f:
    lines = f.readlines()

# Split the DFG text into groups of at most `chunk_size` lines so that each
# request carries a bounded amount of input.
chunk_size = 50
chunks = [lines[i:i+chunk_size] for i in range(0, len(lines), chunk_size)]

all_results = []

# Process every chunk with its own request.
for i, chunk in enumerate(chunks):
    preprocessed_text = "".join(chunk)

    # The line introducing the input explains the "source -> target : count"
    # format of DFG_as_text.txt so the trailing frequency is not read as a
    # case ID.
    prompt = f"""
You are an expert in analyzing business process logs. The following input describes several process cases with sequences of activities.

Your task is to identify **homonym-based errors**, where:
- An activity uses a name that looks similar or identical to a correct activity,
- But the meaning or use of the word is inconsistent with the typical process flow.
- These errors may involve word ambiguity, misuse of similar terms, or naming inconsistencies.

Please examine the event traces carefully and highlight activities that:
- Have ambiguous or misleading names,
- Appear out of place based on context,
- Or could be mistaken due to being homonyms or near-homonyms of valid process activities.

For each suspected homonym error, provide the following:
- Case ID (if available)
- The ambiguous or incorrect activity name
- Its position in the sequence (e.g., 2nd activity)
- The expected or likely correct activity name
- A brief explanation of why the activity is likely a homonym error

Here is the process log input. Each line is a directly-follows relation written as "source activity -> target activity : frequency"; the trailing number counts how often the pair occurs and is not a case ID:
{preprocessed_text}

Return your findings in a structured format.
"""

    # Call GPT-3.5-turbo for this chunk.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a business process analysis expert."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        max_tokens=3000
    )

    # The message content can be None; store an empty string in that case so
    # the final write below cannot fail after all requests have been made.
    result = response.choices[0].message.content or ""
    all_results.append(result)

    print(f"\n=== 결과 (Chunk {i+1}) ===\n{result}")


# Write the answers in chunk order, separated by a blank line. UTF-8 is set
# explicitly because model output may contain non-ASCII characters that the
# platform default encoding cannot represent.
with open("anomaly_results.txt", "w", encoding="utf-8") as f:
    for chunk_result in all_results:
        f.write(chunk_result + "\n\n")

print("\n✅ 모든 Chunk 결과 저장 완료: anomaly_results.txt")


=== 결과 (Chunk 1) ===
### Homonym-Based Errors Identified:

1. **Case ID:** 1549
   - **Ambiguous Activity Name:** Perform checks
   - **Position:** 2nd activity
   - **Expected Correct Activity Name:** Application check
   - **Explanation:** The activity "Perform checks" is likely a homonym error as it should be "Application check" based on the context of processing a new online application.

2. **Case ID:** 2355
   - **Ambiguous Activity Name:** Notify accept
   - **Position:** 4th activity
   - **Expected Correct Activity Name:** Send notification
   - **Explanation:** The activity "Notify accept" is likely a homonym error as it should be "Send notification" to maintain consistency in the process flow.

3. **Case ID:** 2355
   - **Ambiguous Activity Name:** Deliver card
   - **Position:** 5th activity
   - **Expected Correct Activity Name:** Deliver card
   - **Explanation:** The activity "Deliver card" is likely a homonym error as it is repeated and may cause confusion in the proce

----------------------------

##Variant


In [None]:
import pandas as pd

# Load the raw event log from CSV.
file_path = "/content/Credit-TRAIN-HOMONYM-0.1-0.CSV"
df = pd.read_csv(file_path)

# Gather each case's activities, in row order, as one list per case.
trace_dict = df.groupby("Case")["Activity"].apply(list)

# Tally how many cases share the same activity sequence (a "variant").
variant_dict = {}
for activities in trace_dict:
    sequence = " -> ".join(activities)
    if sequence in variant_dict:
        variant_dict[sequence] += 1
    else:
        variant_dict[sequence] = 1

# Number the variants from 1 in the order they were first encountered and
# render one text line per variant.
variant_lines = [
    f"Variant {number}: {sequence} (Count: {occurrences})"
    for number, (sequence, occurrences) in enumerate(variant_dict.items(), 1)
]

# Echo the variants to the cell output.
for entry in variant_lines:
    print(entry)

# Persist the same lines, one variant per line.
with open("Trace_Variants_as_text.txt", "w") as f:
    f.writelines(f"{entry}\n" for entry in variant_lines)

print("Trace Variants 전처리 완료 및 저장 완료!")


Variant 1: Check for completeness -> New online application received -> Perform checks -> Make decision -> Notify accept -> Deliver card -> EVENT 13 END (Count: 673)
Variant 2: Check for completeness -> New online application received -> application check -> Make decision -> Notify accept -> Deliver card -> EVENT 13 END (Count: 85)
Variant 3: Check for completeness -> New online application received -> Request info -> info received -> Check for completeness -> Perform checks -> Make decision -> notify reject -> time out -> EVENT 13 END (Count: 149)
Variant 4: Check for completeness -> New online application received -> Request info -> info received -> Check for completeness -> Perform checks -> Make decision -> Notify accept -> Deliver card -> EVENT 13 END (Count: 306)
Variant 5: Check for completeness -> receive information -> Perform checks -> Make decision -> Notify accept -> Deliver card -> EVENT 13 END (Count: 69)
Variant 6: Check for completeness -> New online application receive

In [None]:
import openai


import os

# Take the API key from the OPENAI_API_KEY environment variable rather than
# hard-coding it in the notebook. When the variable is unset this falls back
# to the empty string the cell used originally.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

with open("Trace_Variants_as_text.txt", "r") as f:
    lines = f.readlines()

# Split the variant list into groups of at most `chunk_size` lines so that
# each request carries a bounded amount of input.
chunk_size = 10
chunks = [lines[i:i+chunk_size] for i in range(0, len(lines), chunk_size)]

all_results = []


# Process every chunk with its own request.
for i, chunk in enumerate(chunks):
    preprocessed_text = "".join(chunk)


    prompt = f"""
You are an expert in analyzing business process data. The following input contains a list of **trace variants** extracted from a business process event log.

Your task is to identify **homonym-related quality issues**, where:
- An activity name in the variant is semantically different from the standard process terminology,
- But the name is similar or identical to a correct activity (i.e., homonyms or near-homonyms),
- This includes ambiguous or misleading activity names that could be easily mistaken for others.

These types of issues often occur due to:
- Naming inconsistencies
- Use of general or vague activity terms
- Activities with names that look correct but mean something different

Please review each variant and highlight any **suspicious activity labels** that may indicate homonym errors.

For each suspected error, provide:
- Variant number (if available)
- The full activity sequence
- The activity name that may be a homonym
- The likely correct or expected term (if any)
- A brief explanation why it is considered a homonym error

Trace Variants:
{preprocessed_text}

Return your analysis in a clear and structured format.
"""

    # Call GPT-3.5-turbo for this chunk.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a business process analysis expert."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        max_tokens=1500
    )

    # The message content can be None; store an empty string in that case so
    # the final write below cannot fail after all requests have been made.
    result = response.choices[0].message.content or ""
    all_results.append(result)

    print(f"\n=== 결과 (Chunk {i+1}) ===\n{result}")

# Write the answers in chunk order, separated by a blank line. UTF-8 is set
# explicitly because model output may contain non-ASCII characters that the
# platform default encoding cannot represent.
with open("anomaly_results_trace_variants.txt", "w", encoding="utf-8") as f:
    for chunk_result in all_results:
        f.write(chunk_result + "\n\n")

print("\n✅ 모든 Trace Variants 결과 저장 완료: anomaly_results_trace_variants.txt")



=== 결과 (Chunk 1) ===
### Suspicious Activity Labels with Homonym Errors:

#### Variant 2:
- **Activity Sequence:** Check for completeness -> New online application received -> application check -> Make decision -> Notify accept -> Deliver card -> EVENT 13 END
- **Suspicious Activity Label:** "application check"
- **Likely Correct Term:** "Perform checks"
- **Explanation:** "application check" may be mistaken for "Perform checks," which is a more standard term in the process.

#### Variant 5:
- **Activity Sequence:** Check for completeness -> receive information -> Perform checks -> Make decision -> Notify accept -> Deliver card -> EVENT 13 END
- **Suspicious Activity Label:** "receive information"
- **Likely Correct Term:** "New online application received"
- **Explanation:** "receive information" could be confused with "New online application received," which is the standard initial activity.

#### Variant 7:
- **Activity Sequence:** Check for completeness -> New online application r

In [None]:
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.objects.conversion.process_tree import converter as pt_converter

# 1. Rename the columns to the XES-style names passed on to pm4py.
# NOTE: `df`, `pd`, `dataframe_utils` and `log_converter` are not defined in
# this cell; they must already exist from previously executed cells.
df_pm4py = df.rename(columns={
    "Case": "case:concept:name",
    "Activity": "concept:name",
    "Timestamp": "time:timestamp"
})

# 2. Convert the timestamps; unparseable values become NaT (errors='coerce').
df_pm4py["time:timestamp"] = pd.to_datetime(df_pm4py["time:timestamp"], errors='coerce')
df_pm4py = dataframe_utils.convert_timestamp_columns_in_df(df_pm4py)

# 3. Convert the dataframe into an event log.
log = log_converter.apply(df_pm4py, variant=log_converter.Variants.TO_EVENT_LOG)

# 4. Extract a process tree with the Inductive Miner.
process_tree = inductive_miner.apply(log)

# 5. Convert the process tree into a Petri net.
from pm4py.objects.conversion.process_tree import converter as pt_converter
net, initial_marking, final_marking = pt_converter.apply(process_tree)


def _node_text(node):
    """Return the activity label of a labelled transition, else the node name.

    Places have no label and silent transitions have ``label is None``; both
    are identified by their ``name``.
    """
    label = getattr(node, "label", None)
    return label if label is not None else node.name


# 6. Extract the Petri net structure as text lines.
# Each group is sorted so the printed and saved output is reproducible across
# runs instead of depending on the iteration order of the net's collections.
places = [f"Place: {name}" for name in sorted(p.name for p in net.places)]
transitions = [
    f"Transition: {label}"
    for label in sorted(t.label for t in net.transitions if t.label is not None)
]
# Arcs refer to labelled transitions by their activity label so that they can
# be matched against the "Transition: ..." lines above; places and silent
# transitions keep their internal names.
arcs = sorted(
    f"Arc: {_node_text(arc.source)} -> {_node_text(arc.target)}"
    for arc in net.arcs
)

# 7. Print and save.
petri_text = places + transitions + arcs

for line in petri_text:
    print(line)

with open("Petri_Net_as_text.txt", "w") as f:
    for line in petri_text:
        f.write(line + "\n")

print("✅ Petri Net 추출 및 저장 완료!")


  return fn(*args, **kwargs)


Place: p_45
Place: p_51
Place: p_37
Place: p_42
Place: p_13
Place: p_16
Place: p_18
Place: p_14
Place: source
Place: p_5
Place: p_49
Place: p_28
Place: p_36
Place: p_44
Place: p_26
Place: p_39
Place: p_41
Place: p_50
Place: p_25
Place: p_20
Place: p_40
Place: p_29
Place: p_33
Place: p_34
Place: p_17
Place: p_19
Place: sink
Place: p_8
Place: p_9
Place: p_11
Place: p_15
Place: p_24
Place: p_35
Place: p_32
Place: p_30
Place: p_48
Place: p_27
Transition: EVENT 13 END
Transition: application check
Transition: Check for completeness
Transition: notify reject
Transition: Make decision
Transition: review request received
Transition: info received
Transition: Notify accept
Transition: Deliver card
Transition: New online application received
Transition: receive information
Transition: Request info
Transition: request information
Transition: time out
Transition: Perform checks
Transition: send notification
Arc: p_13 -> init_loop_10
Arc: tauSplit_13 -> p_17
Arc: p_24 -> skip_22
Arc: p_11 -> skip_9