In [270]:
import pandas as pd
import os
import shutil

def reCreateDir(dirName):
    """Reset `dirName` to a brand-new, empty directory.

    If something already exists at `dirName` it is removed recursively with
    ``shutil.rmtree``; the directory is then created with ``os.makedirs``,
    which also creates any missing parent directories.
    """
    if os.path.exists(dirName):
        # Left over from an earlier run: drop the whole tree first.
        shutil.rmtree(dirName)

    os.makedirs(dirName)
    
def writePlan(trace_df, file_path):
    """Write one trace to `file_path` as a plan file.

    Every value of ``trace_df["Activity"]`` is written lower-cased on its own
    line, in order, followed by a final ``;cost`` line.  An existing file at
    `file_path` is overwritten.
    """
    with open(file_path, 'w') as plan_file:
        plan_file.writelines(step.lower() + '\n' for step in trace_df["Activity"])
        plan_file.write(';cost\n')


########## group to traces #######
# Path to the event log: a text file whose first line is the header and whose
# fields are separated by ';'.
csv_file_path = 'sepsis_cases_1.csv'

# Read the raw text.  Fields are split by hand below, so every value in the
# resulting DataFrame stays a plain string.
with open(csv_file_path, 'r') as file:
    lines = file.readlines()

if not lines:
    # Without a header row there are no column names to work with.
    raise ValueError("%s is empty: expected a ';'-separated header row" % csv_file_path)

# The first line carries the column names.
header = lines[0].strip().split(';')

# Split every remaining line into its ';'-separated fields.  Blank lines are
# skipped: they would otherwise become a one-field row padded with missing
# values.
data = []
for line in lines[1:]:
    stripped = line.strip()
    if not stripped:
        continue
    data.append(stripped.split(';'))

# Build the DataFrame with the header as column names in one step, so a file
# that contains only the header yields an empty frame with the right columns.
df = pd.DataFrame(data, columns=header)


####### sort traces ###############
# One pass over the event log: consecutive rows sharing a Case ID form one
# trace.  Every finished trace is stored together with the timestamp of its
# last row, and the traces are finally ordered by that timestamp.
trace_columns = ['Case ID', 'label', 'Activity', 'time:timestamp']


def _as_trace_frame(event_rows):
    """Turn a list of [case id, label, activity, timestamp] rows into a DataFrame."""
    frame = pd.DataFrame(event_rows)
    frame.columns = trace_columns
    return frame


curr_case_id = False      # Case ID of the trace being collected (False before the first row)
t = False                 # timestamp of the most recently read row
trace_time_tuple = []     # (timestamp of the trace's last row, trace DataFrame)
data_each_trace = []      # rows of the trace currently being collected

for index, row in df.iterrows():
    case_id = row.loc['Case ID']
    activity = row.loc['Activity']
    label = row.loc['label']
    last_t, t = t, row.loc['time:timestamp']

    if case_id != curr_case_id:
        # A different case starts here: close the trace collected so far,
        # stamped with the timestamp of the row read just before this one.
        if data_each_trace:
            dft = _as_trace_frame(data_each_trace)
            trace_time_tuple.append((last_t, dft))

        data_each_trace = []
        curr_case_id = case_id

    data_each_trace.append([case_id, label, activity, t])

# The loop closes a trace only when the next one begins, so the final trace
# is closed here.
dft = _as_trace_frame(data_each_trace)
trace_time_tuple.append((t, dft))

# Order the traces by the timestamp stored alongside each of them.
sorted_tuples = list(trace_time_tuple)
sorted_tuples.sort(key=lambda pair: pair[0])


############# group the traces in timeline order and group according to goal ###########
# Walk the traces in their sorted order and put each one into a bucket chosen
# by the label of its first row.  A trace is dropped when accepting it would
# violate the size limits checked below.
inbalance_threshold = 5

label_0 = []   # accepted traces labelled "regular"
label_1 = []   # accepted traces labelled "deviant"

for tu in sorted_tuples:
    trace = tu[1]
    first_label = trace.iloc[0]['label']

    if first_label == "regular":
        # Accepted only while there are fewer regular than deviant traces.
        if len(label_0) < len(label_1):
            label_0.append(trace)
    elif first_label == "deviant":
        # Accepted while the deviant bucket leads the regular bucket by less
        # than inbalance_threshold traces.
        if len(label_1) - len(label_0) < inbalance_threshold:
            label_1.append(trace)
        
      
    
############ generate training and testing datasets ############
# Start from clean output directories; anything left from an earlier run is
# removed by reCreateDir.
reCreateDir("training")
reCreateDir("training/goal_0")
reCreateDir("training/goal_1")
reCreateDir("testing")

# Number of traces per goal written to the training set; every further trace
# of that goal becomes a test case.
trainSize = 10

# Test files of the two goals share one numbering: goal_0 gets the odd numbers
# (1, 3, 5, ...) and goal_1 the even ones (2, 4, 6, ...).  The offset is taken
# from trainSize rather than a literal 10, so the numbering still starts at
# 1 / 2 when trainSize is changed.

traceCount = 0  # stays 0 when label_0 is empty
for traceCount, tr in enumerate(label_0, start=1):
    if traceCount <= trainSize:
        file_path = "training/goal_0/sas_plan.%s" % traceCount
    else:
        file_path = "testing/sas_plan_0.%s" % ((traceCount - trainSize) * 2 - 1)
    writePlan(tr, file_path)

traceCount = 0  # stays 0 when label_1 is empty
for traceCount, tr in enumerate(label_1, start=1):
    if traceCount <= trainSize:
        file_path = "training/goal_1/sas_plan.%s" % traceCount
    else:
        file_path = "testing/sas_plan_1.%s" % ((traceCount - trainSize) * 2)
    writePlan(tr, file_path)

In [271]:
################# convert to XES and mine the model ###################
# For each goal: convert the plan files in training/<goal> to an XES log with
# sas2xes.jar, then run autoMiner from miner.jar on that log to produce a PNML
# model.  Every command's exit status is checked, so a failed conversion is
# reported immediately instead of the miner being run on a missing or stale
# XES file.


def _run_or_fail(command):
    """Run `command` through os.system and raise RuntimeError on a non-zero status."""
    status = os.system(command)
    if status != 0:
        raise RuntimeError("command failed with status %r: %s" % (status, command))
    return status


for goal in ("goal_0", "goal_1"):
    xes_path = "training/%s.xes" % goal
    _run_or_fail("java -jar ../sas2xes.jar training/%s %s" % (goal, xes_path))
    _run_or_fail("java -cp ../miner.jar autoMiner -DFM %s %s.pnml 0.8" % (xes_path, xes_path))

ERROR StatusLogger No log4j2 configuration file found. Using default configuration: logging only errors to the console. Set system property 'org.apache.logging.log4j.simplelog.StatusLogger.level' to TRACE to show Log4j2 internal initialization logging.


MineDFM: training/goal_0.xes.pnml    Success!


ERROR StatusLogger No log4j2 configuration file found. Using default configuration: logging only errors to the console. Set system property 'org.apache.logging.log4j.simplelog.StatusLogger.level' to TRACE to show Log4j2 internal initialization logging.


MineDFM: training/goal_1.xes.pnml    Success!


0

In [1]:
import os
import re
import subprocess
import numpy as np
import pandas as pd
import statistics
import shutil

from sklearn import linear_model

def func_precision(stringList, answer):
    """Precision of one recognition result.

    Parameters
    ----------
    stringList : list of str
        The returned goals.  The denominator is ``len(stringList) - 1``: the
        caller in this file builds the list with ``split("/")`` on a string
        that ends in "/", so the last element is not a goal.
    answer
        The true goal; compared to the entries as ``str(answer)``.

    Returns
    -------
    float
        ``1 / (len(stringList) - 1)`` if the true goal is among the entries,
        otherwise 0.  When the list holds no goal at all (length 0 or 1) the
        result is 0.0 instead of a ZeroDivisionError.
    """
    returned = len(stringList) - 1
    if returned <= 0:
        # Nothing was returned, so nothing can be correct.
        return 0.0

    found = 1 if str(answer) in stringList else 0
    return found / returned

def func_recall(stringList, answer):
    """Recall of one recognition result.

    Returns 1 if any entry of `stringList` equals ``str(answer)``, else 0.
    """
    return int(any(result == str(answer) for result in stringList))

def func_accuracy(total, stringList, answer):
    """Accuracy of one recognition result.

    `total` is the number of all goals.  The last element of `stringList` is
    ignored.  Each remaining entry counts as a true positive when it equals
    ``str(answer)`` and as a false positive otherwise; false negatives are
    ``1 - tp`` and true negatives are whatever remains of `total`.

    Returns ``(tp + tn) / (tp + tn + fp + fn)``.
    """
    predicted = stringList[0:-1]
    tp = sum(1 for result in predicted if result == str(answer))
    fp = len(predicted) - tp

    fn = 1 - tp

    # total is the number of all goals
    tn = total - tp - fp - fn
    return (tp + tn)/(tn + tp + fp + fn)


def func_bacc(total, stringList, answer):
    """Balanced accuracy of one recognition result.

    Parameters
    ----------
    total : int
        The number of all goals.
    stringList : list of str
        The returned goals; the last element is ignored.
    answer
        The true goal; compared to the entries as ``str(answer)``.

    Returns
    -------
    float
        ``(TPR + TNR) / 2``.  When there are no negatives at all
        (``tn + fp == 0``, e.g. ``total == 1``) the true-negative rate is
        undefined, and the true-positive rate alone is returned instead of
        raising ZeroDivisionError.
    """
    predicted = stringList[0:-1]
    target = str(answer)

    tp = sum(1 for result in predicted if result == target)
    fp = len(predicted) - tp

    fn = 1 - tp

    # total is the number of all goals
    tn = total - tp - fp - fn

    tpr = tp/(tp + fn)

    negatives = tn + fp
    if negatives == 0:
        # No negative goals to score: average over the one rate that exists.
        return tpr

    tnr = tn/negatives
    return (tpr + tnr)/2


def calculate_statistics(rows):
    """Compute per-test-case metrics for a results table.

    `rows` is iterated with ``iterrows()``; each row must provide
    ``"Real_Goal"`` plus the "/"-separated strings ``"Results"`` and
    ``"Cost"``.  The number of pieces in ``"Cost"`` minus one is used as the
    total number of goals.

    Returns four lists with one entry per row, in row order:
    precision, recall, accuracy and balanced accuracy.
    """
    precision, recall, accuracy, b_accuracy = [], [], [], []

    for _, case in rows.iterrows():
        answer = case["Real_Goal"]
        results = case["Results"].split("/")
        # the last piece after the final "/" is not a candidate
        total = len(case["Cost"].split("/")) - 1

        precision.append(func_precision(results, answer))
        recall.append(func_recall(results, answer))
        accuracy.append(func_accuracy(total, results, answer))
        b_accuracy.append(func_bacc(total, results, answer))

    return precision, recall, accuracy, b_accuracy




In [8]:
# sepsis 2 delete middle
# NOTE(review): `obs` is not assigned in this cell. The only assignment visible
# in this notebook is `obs = 1.0` in the BPIC11 cell further down, so unless it
# is defined in a cell not shown here this cell fails on a fresh top-to-bottom
# run -- TODO confirm the intended value and define it before this cell.
sepsis_runs = [
    ("no relearn:", "results_sepsis_2/out-no-relearn-sepsis_cases_2_%s.csv"),
    ("open loop relearn:", "results_sepsis_2/out-openloop-sepsis_cases_2_%s.csv"),
    ("closed loop ave relearn:", "results_sepsis_2/out-closedloop_ave_metric-sepsis_cases_2_%s.csv"),
    ("closed loop trend relearn:", "results_sepsis_2/out-closedloop-trend-sepsis_cases_2_%s.csv"),
]

# For every strategy print the mean balanced accuracy over the window
# bacc[0:40] and over the window bacc[161:200].
for run_number, (heading, path_template) in enumerate(sepsis_runs):
    rdf = pd.read_csv(path_template % obs)
    p, r, a, bacc = calculate_statistics(rdf)

    if run_number == 0:
        # reference value taken from the "no relearn" run
        ori_acc = statistics.mean(bacc[0:40])

    print(heading)
    print(statistics.mean(bacc[0:40]))
    print(statistics.mean(bacc[161:200]))

no relearn:
0.7125
0.6515151515151515
open loop relearn:
0.75
0.7121212121212122
closed loop ave relearn:
0.7125
0.7575757575757576
closed loop trend relearn:
0.7125
0.7272727272727273


In [3]:
obs = 1.0
# BPIC11_f2
bpic_runs = [
    ("no relearn:", "results_BPIC11_f2/out-no-relearn-BPIC11_f2_%s.csv"),
    ("open loop relearn:", "results_BPIC11_f2/out-openloop-BPIC11_f2_%s.csv"),
    ("closed loop ave relearn:", "results_BPIC11_f2/out-closedloop_ave_metric-BPIC11_f2_%s.csv"),
    ("closed loop trend relearn:", "results_BPIC11_f2/out-closedloop-trend-BPIC11_f2_%s.csv"),
]

# For every strategy print the mean balanced accuracy over bacc[0:100] and
# over bacc[101:200].
# NOTE(review): index 100 falls in neither window -- confirm that [101:200]
# is intended rather than [100:200].
for heading, path_template in bpic_runs:
    rdf = pd.read_csv(path_template % obs)
    p, r, a, bacc = calculate_statistics(rdf)
    print(heading)
    print(statistics.mean(bacc[0:100]))
    print(statistics.mean(bacc[101:200]))

no relearn:
0.54
0.42424242424242425
open loop relearn:
0.58
0.5656565656565656
closed loop ave relearn:
0.57
0.5353535353535354
closed loop trend relearn:
0.61
0.601010101010101


In [4]:
# hospital 3
# Uses `obs` as left by an earlier cell (this cell does not assign it).
hospital_runs = [
    ("no relearn:", "results_hospital_3/out-no-relearn-hospital_billing_3_%s.csv"),
    ("open loop relearn:", "results_hospital_3/out-openloop-hospital_billing_3_%s.csv"),
    ("closed loop ave relearn:", "results_hospital_3/out-closedloop_ave_metric-hospital_billing_3_%s.csv"),
    ("closed loop trend relearn:", "results_hospital_3/out-closedloop-trend-hospital_billing_3_%s.csv"),
]

# For every strategy print the mean balanced accuracy over bacc[0:100] and
# over bacc[101:200].
# NOTE(review): index 100 falls in neither window -- confirm that [101:200]
# is intended rather than [100:200].
for heading, path_template in hospital_runs:
    rdf = pd.read_csv(path_template % obs)
    p, r, a, bacc = calculate_statistics(rdf)
    print(heading)
    print(statistics.mean(bacc[0:100]))
    print(statistics.mean(bacc[101:200]))

no relearn:
0.72
0.6363636363636364
open loop relearn:
0.665
0.7272727272727273
closed loop ave relearn:
0.72
0.7171717171717171
closed loop trend relearn:
0.665
0.7323232323232324
