In [None]:
!pip install pm4py
!pip install kaggle


In [None]:
from google.colab import files

# Keep the return value instead of leaving the call as the cell's last
# expression: `files.upload()` returns a dict of filename -> file contents,
# and echoing that dict would dump the raw bytes of the archive into the
# cell output.
uploaded = files.upload()
print("Uploaded files:", list(uploaded))



In [None]:
import zipfile

# Location of the uploaded archive and the directory it is unpacked into.
ARCHIVE_PATH = "archive (2) (1).zip"
EXTRACT_DIR = "dataset_folder"

# The context manager closes the archive once extraction has finished.
with zipfile.ZipFile(ARCHIVE_PATH, mode="r") as archive:
    archive.extractall(path=EXTRACT_DIR)

print("Extraction complete.")


In [None]:
import os
# Show the entries of the extraction directory so the CSV file name can be
# checked before it is loaded.
os.listdir("dataset_folder")


In [None]:
import pandas as pd

# Load the event data from the extracted CSV into a DataFrame.
df = pd.read_csv("dataset_folder/bpi_2017_cleaned.csv")

# Display the first 5 rows as a quick sanity check of the columns.
df.head()


In [None]:
# Print column names, non-null counts and dtypes of the raw frame.
df.info()


In [None]:
import pandas as pd

# Source column name -> short name used throughout the rest of the notebook.
COLUMN_ALIASES = {
    "case:concept:name": "case_id",
    "concept:name": "activity",
    "time:timestamp": "timestamp",
}

# Keep only the three event-log columns, rename them, parse the timestamps
# as ISO 8601, and order the events chronologically within each case.
df_process = (
    df[list(COLUMN_ALIASES)]
    .rename(columns=COLUMN_ALIASES)
    .assign(
        timestamp=lambda events: pd.to_datetime(
            events["timestamp"],
            format="ISO8601",
        )
    )
    .sort_values(["case_id", "timestamp"])
)

df_process.head()



In [None]:
# Number of distinct cases in the event table.
df_process["case_id"].nunique()


In [None]:
# Summary statistics of the prepared event table.
df_process.describe()


In [None]:
# Count missing values per column.
df_process.isnull().sum()


In [None]:
import pm4py

# Tell pm4py which columns of df_process hold the case identifier, the
# activity label and the event timestamp; the formatted result is stored in
# `event_log`.
event_log = pm4py.format_dataframe(
    df_process,
    case_id='case_id',
    activity_key='activity',
    timestamp_key='timestamp'
)

print("Formatting complete.")


In [None]:
# Rebind `event_log` to the result of pm4py.convert_to_event_log.
# NOTE(review): the name is reused for the input and the output, so running
# this cell a second time feeds the already-converted object back in.
event_log = pm4py.convert_to_event_log(event_log)

print("Conversion complete.")


In [None]:
# Discover a process tree from the event log with pm4py's inductive miner.
process_model = pm4py.discover_process_tree_inductive(event_log)

print("Process model discovered.")


In [None]:
# Render the discovered process tree.
pm4py.view_process_tree(process_model)


In [None]:
# `process_model` already holds the tree discovered by the inductive miner on
# this same event log; reuse it under the `process_tree` name instead of
# running the identical discovery call a second time.
process_tree = process_model

print("Process tree discovered.")


In [None]:
# Render the process tree stored in `process_tree`.
pm4py.view_process_tree(process_tree)


In [None]:
# Case duration in days: the span between the first and last event timestamp
# of each case.
SECONDS_PER_DAY = 60 * 60 * 24

case_durations = df_process.groupby("case_id")["timestamp"].agg(["min", "max"])
case_span = case_durations["max"] - case_durations["min"]
case_durations["duration_days"] = case_span.dt.total_seconds() / SECONDS_PER_DAY

case_durations.head()


In [None]:
# Summary statistics of the per-case duration in days.
case_durations["duration_days"].describe()


In [None]:
# Number of events per activity label, most frequent first.
activity_counts = df_process["activity"].value_counts()

# Show the ten most frequent activities.
activity_counts.head(10)


In [None]:
# Waiting time of an event = hours elapsed since the previous event of the
# same case. The first event of each case has no predecessor, so its
# prev_timestamp is missing.
SECONDS_PER_HOUR = 3600

timestamps_by_case = df_process.groupby("case_id")["timestamp"]
df_process["prev_timestamp"] = timestamps_by_case.shift(1)

gap_since_previous = df_process["timestamp"] - df_process["prev_timestamp"]
df_process["waiting_time_hours"] = gap_since_previous.dt.total_seconds() / SECONDS_PER_HOUR

df_process.head()


In [None]:
# Mean waiting time (hours since the previous event in the same case) grouped
# by the activity of the event that ends the wait, longest first.
activity_waiting = df_process.groupby("activity")["waiting_time_hours"].mean().sort_values(ascending=False)

# Show the ten activities with the longest mean waiting time.
activity_waiting.head(10)


In [None]:
# How many times each activity occurs inside each case.
occurrences = df_process.groupby(["case_id", "activity"]).size()
activity_repeats = occurrences.reset_index(name="count")

# A (case, activity) pair counts as rework when the activity occurs more than
# once in that case.
is_repeated = activity_repeats["count"] > 1
rework_cases = activity_repeats[is_repeated]

# For every activity, the number of cases in which it was repeated.
rework_summary = rework_cases["activity"].value_counts()

rework_summary.head(10)


In [None]:
# Collect the variants of the event log as returned by pm4py.get_variants.
variants = pm4py.get_variants(event_log)

# Number of distinct variants.
len(variants)


In [None]:
# Size of each variant, taken as the length of the value stored under it.
# NOTE(review): this assumes each value of `variants` is a sized collection
# (e.g. a list of traces) rather than a plain count - confirm for the pm4py
# version in use.
variant_counts = {k: len(v) for k, v in variants.items()}
# (variant, size) pairs ordered from the largest variant to the smallest.
sorted_variants = sorted(variant_counts.items(), key=lambda x: x[1], reverse=True)

# Show the five largest variants.
sorted_variants[:5]


In [None]:
import matplotlib.pyplot as plt

# Histogram of case durations, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(case_durations["duration_days"], bins=50)
ax.set_xlabel("Case Duration (Days)")
ax.set_ylabel("Number of Cases")
ax.set_title("Distribution of Case Duration")
plt.show()


In [None]:
# The ten activities with the highest mean waiting time.
top_waiting = activity_waiting.head(10)

# Bar chart on an explicit Axes; the activity labels are rotated so they do
# not overlap.
fig, ax = plt.subplots()
ax.bar(top_waiting.index, top_waiting.values)
ax.tick_params(axis="x", rotation=90)
ax.set_xlabel("Activity")
ax.set_ylabel("Average Waiting Time (Hours)")
ax.set_title("Top 10 Bottleneck Activities")
plt.show()


In [None]:
# Final event of every case. The sort is stable so that events sharing a
# timestamp keep the (case_id, timestamp) order df_process was built with;
# the default sort algorithm gives no such guarantee, so ties could change
# which event is reported as the last one.
last_activity = (
    df_process
    .sort_values("timestamp", kind="stable")
    .groupby("case_id")
    .last()
    .reset_index()
)

# Attach the final activity of each case to its duration.
final_df = case_durations.merge(
    last_activity[["case_id", "activity"]],
    on="case_id"
)

# Keep only cases whose final activity is A_Accepted or A_Cancelled.
# NOTE(review): confirm these labels actually occur as final activities in
# this dataset; if they do not, both histograms below are empty.
comparison = final_df[final_df["activity"].isin(["A_Accepted", "A_Cancelled"])]

accepted = comparison[comparison["activity"] == "A_Accepted"]["duration_days"]
cancelled = comparison[comparison["activity"] == "A_Cancelled"]["duration_days"]

# Overlay both distributions. Transparency plus a legend keeps the second
# histogram from hiding the first and makes the two groups identifiable.
fig, ax = plt.subplots()
ax.hist(accepted, bins=40, alpha=0.6, label="A_Accepted")
ax.hist(cancelled, bins=40, alpha=0.6, label="A_Cancelled")

ax.set_xlabel("Case Duration (Days)")
ax.set_ylabel("Number of Cases")
ax.set_title("Accepted vs Cancelled Case Duration")
ax.legend()
plt.show()


In [None]:
# The 15 most common final activities across all cases.
last_activity["activity"].value_counts().head(15)



In [None]:
# Cases whose final activity is O_Cancelled.
cancelled_cases = final_df[final_df["activity"] == "O_Cancelled"]

# Report how many such cases there are and summarise their total duration.
print("Cancelled cases:", len(cancelled_cases))
print(cancelled_cases["duration_days"].describe())


In [None]:
# Cases whose final activity is "W_Validate application".
validate_cases = final_df[final_df["activity"] == "W_Validate application"]

# Report how many such cases there are and summarise their total duration.
print("Validate cases:", len(validate_cases))
print(validate_cases["duration_days"].describe())


In [None]:
# Histogram of total case duration for the cases whose final activity is
# "W_Validate application". duration_days is the span from a case's first
# event to its last, not the time spent in a validation step, so the labels
# describe it as case duration.
plt.figure()
validate_cases["duration_days"].hist(bins=50)
plt.xlabel("Case Duration (Days)")
plt.ylabel("Frequency")
plt.title("Case Duration of Cases Ending in W_Validate application")
plt.show()


In [None]:
# Cases ending in validation that took longer than the 30-day threshold used
# here as the example cut-off for extreme delays.
long_cases = validate_cases[validate_cases["duration_days"] > 30]

print("Cases > 30 days:", len(long_cases))

# Share of long cases among all validate cases. When there are no validate
# cases the ratio is undefined, so report 0.0 instead of dividing by zero.
total_validate_cases = len(validate_cases)
long_share_pct = len(long_cases) / total_validate_cases * 100 if total_validate_cases else 0.0
print("Percentage:", long_share_pct)


In [None]:
# Split the validate cases at the 30-day mark and collect the distinct case
# ids on each side of the threshold.
validate_durations = validate_cases["duration_days"]
within_threshold = validate_durations.le(30)
beyond_threshold = validate_durations.gt(30)

normal_cases_ids = validate_cases.loc[within_threshold, "case_id"].unique()
long_cases_ids = validate_cases.loc[beyond_threshold, "case_id"].unique()

print("Normal cases:", len(normal_cases_ids))
print("Long cases:", len(long_cases_ids))


In [None]:
# Inspect the merged duration / final-activity table and its column names.
print(final_df.head())
print(final_df.columns)



In [None]:
# Display the raw frame loaded from the CSV.
df



In [None]:
# Plain event table used by the remaining cells. Nothing earlier in the
# notebook defines `event_log_df`, so it is derived here from df_process,
# which carries the case_id / activity / timestamp columns the later cells
# read; this keeps the notebook runnable on a fresh kernel.
event_log_df = df_process[["case_id", "activity", "timestamp"]].copy()

# Normalise the timestamps to timezone-aware UTC datetimes.
event_log_df["timestamp"] = pd.to_datetime(
    event_log_df["timestamp"],
    format="mixed",
    utc=True
)
print(event_log_df["timestamp"].head())
print(event_log_df["timestamp"].dtype)



In [None]:
from pm4py.objects.conversion.log import converter as log_converter

# Point the converter at the `case_id` column as the case identifier.
# NOTE(review): only the case-id key is overridden here; the activity and
# timestamp columns still carry the short names `activity` / `timestamp` -
# confirm the converter handles that as intended.
parameters = {
    log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: "case_id"
}

# Convert the event table and rebind `event_log` to the result.
event_log = log_converter.apply(
    event_log_df,
    parameters=parameters
)

print(type(event_log))
print("Number of cases:", len(event_log))



In [54]:
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.visualization.petri_net import visualizer as pn_visualizer


In [58]:
# Copy of the event table with the columns renamed to the
# case:concept:name / concept:name / time:timestamp names, the same names the
# raw CSV columns had before they were shortened.
event_log_df_pm = event_log_df.rename(columns={
    "case_id": "case:concept:name",
    "activity": "concept:name",
    "timestamp": "time:timestamp"
})


In [None]:
from pm4py.objects.conversion.log import converter as log_converter

# Convert the renamed event table and rebind `event_log` to the result.
event_log = log_converter.apply(event_log_df_pm)

print(type(event_log))
print("Number of cases:", len(event_log))
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.objects.conversion.process_tree import converter as pt_converter
from pm4py.visualization.petri_net import visualizer as pn_visualizer

# Step 1: Discover a process tree from the converted log with the inductive
# miner; this rebinds `process_tree`.
process_tree = inductive_miner.apply(event_log)

print(type(process_tree))


In [None]:
from pm4py.objects.conversion.process_tree import converter as pt_converter
from pm4py.visualization.petri_net import visualizer as pn_visualizer

# Convert the process tree to a Petri net; the converter returns the net
# together with its initial and final markings.
net, initial_marking, final_marking = pt_converter.apply(process_tree)

# Build the visualisation of the net and display it.
gviz = pn_visualizer.apply(net, initial_marking, final_marking)
pn_visualizer.view(gviz)


In [None]:
from pm4py.statistics.variants.log import get as variants_get

# Collect the variants of the current event log; this rebinds `variants`.
variants = variants_get.get_variants(event_log)

print("Total variants:", len(variants))

# Sort by frequency, taken as the length of the collection stored under each
# variant, most frequent first. This rebinds `sorted_variants`.
sorted_variants = sorted(variants.items(), key=lambda x: len(x[1]), reverse=True)

# Print the five most frequent variants with their frequency and path.
for i, (variant, cases) in enumerate(sorted_variants[:5]):
    print(f"\nVariant {i+1}")
    print("Frequency:", len(cases))
    print("Path:", variant)


In [None]:
from collections import Counter

from pm4py.objects.conversion.log import converter as log_converter

# Short column name -> name given to the frame before it is handed to the
# log converter (the same renaming used for event_log_df_pm).
PM4PY_COLUMN_NAMES = {
    "case_id": "case:concept:name",
    "activity": "concept:name",
    "timestamp": "time:timestamp",
}


def activity_frequency(log):
    """Count how many events of each activity occur across all traces.

    Args:
        log: iterable of traces, each trace an iterable of events that expose
            their activity label under the "concept:name" key.

    Returns:
        collections.Counter mapping activity label -> number of events.
    """
    return Counter(event["concept:name"] for trace in log for event in trace)


# Events of the validate cases at or below the 30-day mark, and of those
# above it, with the columns renamed for the converter.
normal_df = (
    event_log_df[event_log_df["case_id"].isin(normal_cases_ids)]
    .rename(columns=PM4PY_COLUMN_NAMES)
)
long_df = (
    event_log_df[event_log_df["case_id"].isin(long_cases_ids)]
    .rename(columns=PM4PY_COLUMN_NAMES)
)

normal_log = log_converter.apply(normal_df)
long_log = log_converter.apply(long_df)

# Total number of events per activity in each group.
normal_freq = activity_frequency(normal_log)
long_freq = activity_frequency(long_log)
print("Normal cases activity frequency:")
print(normal_freq)

print("\nLong cases activity frequency:")
print(long_freq)






In [71]:
# Average number of events of each activity per case. The case counts are
# read from the logs themselves rather than hard-coded, so the averages stay
# correct if the data or the long-case threshold changes.
normal_avg = {k: v / len(normal_log) for k, v in normal_freq.items()}
long_avg = {k: v / len(long_log) for k, v in long_freq.items()}


In [None]:
# Number of cases in each group, read from the logs rather than hard-coded so
# the values cannot drift out of sync with the data.
normal_case_count = len(normal_log)
long_case_count = len(long_log)

# Average number of events of each activity per case.
normal_avg = {k: v/normal_case_count for k, v in normal_freq.items()}
long_avg = {k: v/long_case_count for k, v in long_freq.items()}

# Compare over the union of activities so an activity that occurs only in
# normal cases is reported as well; sorting keeps the output order stable.
for activity in sorted(set(normal_avg) | set(long_avg)):
    diff = long_avg.get(activity, 0) - normal_avg.get(activity, 0)
    # Report only activities that differ by more than one event per case.
    if abs(diff) > 1:
        print(activity, "Difference per case:", round(diff, 2))
def count_rework(log, activity_name):
    """Return the mean number of ``activity_name`` events per trace.

    Args:
        log: iterable of traces, each trace an iterable of events that expose
            their activity label under the "concept:name" key.
        activity_name: activity label to count.

    Returns:
        Mean number of matching events per trace, or 0.0 when ``log``
        contains no traces (the mean is undefined there, and dividing by the
        trace count would raise ZeroDivisionError).
    """
    per_trace = [
        sum(1 for event in trace if event["concept:name"] == activity_name)
        for trace in log
    ]
    if not per_trace:
        return 0.0
    return sum(per_trace) / len(per_trace)

# Mean number of "W_Call after offers" events per case, normal vs long cases.
print("Normal avg W_Call after offers:",
      count_rework(normal_log, "W_Call after offers"))

print("Long avg W_Call after offers:",
      count_rework(long_log, "W_Call after offers"))
from collections import Counter

def end_activity_distribution(log):
    """Tally the activity label of the final event of every trace in ``log``.

    Returns a collections.Counter mapping activity label -> number of traces
    that end with that activity.
    """
    tally = Counter()
    for trace in log:
        final_event = trace[-1]
        tally[final_event["concept:name"]] += 1
    return tally

# Final-activity tallies for the normal and the long cases.
print("Normal end:", end_activity_distribution(normal_log))
print("Long end:", end_activity_distribution(long_log))




In [None]:
def count_activity_per_trace(log, activity_name):
    """Return, for each trace of ``log`` in order, its number of ``activity_name`` events.

    Events are matched on the label stored under their "concept:name" key.
    """
    return [
        len([event for event in trace if event["concept:name"] == activity_name])
        for trace in log
    ]

# Number of "W_Call incomplete files" events in each long case.
long_incomplete_counts = count_activity_per_trace(long_log, "W_Call incomplete files")

# Percentage of long cases with more than five such events.
cases_over_five = len([n_calls for n_calls in long_incomplete_counts if n_calls > 5])
cases_over_five / len(long_incomplete_counts) * 100
