In [1]:
import math
import os
import shutil
import statistics
import subprocess

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans, AgglomerativeClustering



def select_features(df, labels, features, n_features):
    """Pick one representative feature from every feature cluster.

    Features are bucketed by their cluster label. Within each bucket the
    feature whose absolute correlations with the bucket's members sum to the
    largest value is kept (ties resolve to the first such feature).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a column for every name in ``features``.
    labels : sequence of int
        Cluster label of each feature, aligned with ``features``; each label
        is used as an index into ``range(n_features)``.
    features : sequence
        Feature (column) names.
    n_features : int
        Number of clusters, i.e. the maximum number of features returned.

    Returns
    -------
    list
        One feature name per non-empty cluster, in ascending label order.
        Clusters without any member are skipped, so the result can be shorter
        than ``n_features``.
    """
    # One bucket per cluster label.
    grouped_features = [[] for _ in range(n_features)]
    for position, feature in enumerate(features):
        grouped_features[labels[position]].append(feature)

    selected_features = []
    for group in grouped_features:
        if not group:
            # An empty bucket has no correlation matrix; argmax() on it would
            # raise, so there is simply nothing to pick from this cluster.
            continue
        # Row sums of the absolute correlation matrix measure how strongly a
        # feature is related to the other members of its cluster.
        correlation_strength = df[group].corr().abs().sum(axis=1)
        selected_features.append(group[correlation_strength.argmax()])

    return selected_features

# split the df into one trace (sub-frame) per Subject / Loc / Iteration combination
def extract_traces(dfn):
    """Split a frame into one trace per (Subject, Loc, Iteration) combination.

    Parameters
    ----------
    dfn : pandas.DataFrame
        Frame with at least the columns ``Subject``, ``Loc`` and ``Iteration``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct key combination, in order of first appearance.
        Each frame contains all rows of ``dfn`` carrying that combination, in
        their original order, with a fresh 0-based index.

    Notes
    -----
    Grouping (instead of comparing each row against a ``(0, 0, 0)`` start
    sentinel) keeps a trace whose keys are all zero, and returns every
    combination exactly once even if its rows are not contiguous.
    """
    key_columns = ["Subject", "Loc", "Iteration"]
    # sort=False keeps the groups in the order their keys first occur.
    return [
        trace.reset_index(drop=True)
        for _, trace in dfn.groupby(key_columns, sort=False)
    ]


def convert_labels_kmeans(traces, goals, subject_id):
    """Collect the class-label sequences of one subject, grouped by goal.

    Parameters
    ----------
    traces : iterable of pandas.DataFrame
        Trace frames with the columns ``Subject``, ``Loc`` and ``class``.
        The first row of a trace decides which subject and goal it belongs to.
    goals : iterable
        ``Loc`` values to collect, in the desired output order.
    subject_id :
        Only traces whose ``Subject`` equals this value are used.

    Returns
    -------
    list of list of list
        ``result[g]`` holds, for ``goals[g]``, one list of ``class`` values per
        matching trace (in trace order).
    """
    subtraces = []
    for goal in goals:
        goal_sequences = []
        for trace in traces:
            if trace.empty:
                # No first row to identify the trace by.
                continue
            # Positional access works regardless of how the trace is indexed.
            same_subject = trace["Subject"].iloc[0] == subject_id
            same_goal = trace["Loc"].iloc[0] == goal
            if same_subject and same_goal:
                # Read the column directly: iterating rows with iterrows()
                # coerces every row to one dtype, which turns integer labels
                # into floats as soon as the frame has a float column.
                goal_sequences.append(trace["class"].tolist())
        subtraces.append(goal_sequences)

    return subtraces



############################# file system helpers ########################

# random.randint(0,2)

def reCreateDir(dirName):
    """Make ``dirName`` a fresh, empty directory.

    If the path already exists, its whole tree is deleted first; afterwards
    the directory (including missing parents) is created.
    """
    if os.path.exists(dirName):
        # Start from scratch so files of a previous run cannot linger.
        shutil.rmtree(dirName)

    os.makedirs(dirName)
    
    
# write sas_plan
def write_plan(actions, file):
    """Write a plan file: one action per line followed by a cost footer.

    Parameters
    ----------
    actions : sized iterable
        Plan steps; each is written via ``str()`` on its own line.
    file : str or path-like
        Destination path; an existing file is overwritten.

    Returns
    -------
    int
        Always ``0``.
    """
    lines = ["%s\n" % (str(action)) for action in actions]
    # The footer reports the number of actions as the plan cost.
    lines.append("; cost %s (unit cost)" % (str(len(actions))))

    # The context manager closes the handle even if writing fails.
    with open(file, "w") as plan_file:
        plan_file.write("".join(lines))
    return 0
        
# write_plan(subtraces_goal2[1], "sas_plan.1")


In [13]:
# parameters:
input_data = "examplef30.csv"
index_headers = 4   # number of leading columns that are not features
n_features = 15     # number of feature clusters / features to keep

n_init = 20
n_clusters = 10
subject_id = 10

# recognizer param:
phi = 50
lamb = 1.5
delta = 1.0
theta = 0.9


# dependent
output_results = "example_c%s_f%s.csv" % (str(n_clusters), str(n_features))


# main script
df = pd.read_csv(input_data)

## everything after the first `index_headers` columns is treated as a feature
all_features = df.columns[index_headers:].tolist()
df_context = df[all_features]

############################## select features ##############################
# Cluster the features by the rows of their absolute correlation matrix and
# keep one representative per cluster.
corr_matrix = df_context.corr().abs()
# Ward linkage only supports the Euclidean metric, which is also the default.
# The old `affinity=` keyword is gone from recent scikit-learn releases, so it
# is left out to stay compatible with old and new versions alike.
hierarchical_cluster = AgglomerativeClustering(n_clusters=n_features, linkage='ward').fit(corr_matrix)
labels = hierarchical_cluster.labels_

selected_features = select_features(df, labels, all_features, n_features)
#selected_features = all_features
############################## classification ###########################
# analyze by subject:
df_a_subject = df.loc[df['Subject'] == subject_id].reset_index(drop=True)
df_a_subject_index = df_a_subject[["Subject", "Loc", "Iteration"]]

reduced = df_a_subject[selected_features]

# TODO: decide how many classes to set (e.g. from the average trace length)

# KMeans cannot form more clusters than there are rows; stop with a clear
# message instead of printing and then failing inside KMeans.
if n_clusters > len(reduced):
    raise ValueError(
        "too many clusters: n_clusters=%s but subject %s has only %s rows"
        % (str(n_clusters), str(subject_id), str(len(reduced)))
    )

# random_state fixes the seed so the class labels are reproducible.
kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=n_init).fit(reduced)
df_classes = pd.DataFrame(kmeans.labels_, columns = ['class'])
# Both frames share the same 0-based index, so this is a row-wise join.
df_classified = pd.concat([df_a_subject_index, df_classes], axis=1)

traces = extract_traces(df_classified)
goals = sorted(df_classified["Loc"].unique())
subtraces = convert_labels_kmeans(traces, goals, subject_id)



In [14]:
# Class-label sequences (one list per trace) for the first goal in sorted `Loc` order.
subtraces[0]

[[8, 1, 2, 4, 2, 7, 1],
 [8, 7, 4, 4, 2, 7, 1, 8, 8],
 [8, 1, 2, 4, 4, 7, 1, 8],
 [1, 2, 4, 4, 2, 7, 1, 8]]

In [15]:
# Class-label sequences (one list per trace) for the second goal in sorted `Loc` order.
subtraces[1]

[[1, 7, 0, 9, 0, 5, 6, 6, 6],
 [8, 1, 7, 0, 3, 3, 5, 6],
 [8, 7, 0, 9, 9, 0, 5, 6, 6],
 [1, 7, 4, 9, 9, 0, 5, 6]]

In [16]:
def _run(command, cwd=None):
    """Run an external command given as an argument list (no shell involved).

    A non-zero exit status is reported on stdout rather than being silently
    dropped; the status is returned to the caller.
    """
    completed = subprocess.run(command, cwd=cwd)
    if completed.returncode != 0:
        print("WARNING: '%s' exited with status %s" % (" ".join(command), completed.returncode))
    return completed.returncode


# Remove the results of a previous run (file or directory) without a shell.
if os.path.isdir(output_results):
    shutil.rmtree(output_results)
elif os.path.exists(output_results):
    os.remove(output_results)


reCreateDir("test")
reCreateDir("model")

# Index of the trace that is held out of every goal and used as test input.
test_id = 3
for goal_id, a_goal in enumerate(subtraces):
    if test_id >= len(a_goal):
        # Without this check the hold-out step would fail and the remaining
        # steps would run on incomplete input.
        raise ValueError(
            "goal %s has only %s traces; cannot hold out trace %s"
            % (str(goal_id), str(len(a_goal)), str(test_id))
        )

    goal_dir = "goal_%s" % str(goal_id)
    reCreateDir(goal_dir)
    for trace_id, a_trace in enumerate(a_goal):
        write_plan(a_trace, "%s/sas_plan.%s" % (goal_dir, str(trace_id)))

    # test: move the held-out trace out of the goal directory
    shutil.move("%s/sas_plan.%s" % (goal_dir, str(test_id)),
                "test/sas_plan.%s" % str(goal_id))
    # model: presumably converts the remaining plans of this goal into an XES
    # log -- TODO confirm against the sas2xes tool
    _run(["java", "-jar", "sas2xes.jar", goal_dir, "model/%s.xes" % str(goal_id)])

# Run the miner from inside its own directory; `cwd=` leaves the notebook's
# working directory untouched even if the call fails.
_run(["java", "-jar", "mine_all_pnmls.jar", "-DFM", "../model/", "0.8"], cwd="miningPNMLS")


for i in range(len(subtraces)):
    # positional arguments: test plan, goal_id, percentage, phi, lamb, delta,
    # theta, results file
    for percentage in [0.1, 0.3, 0.5, 0.7, 1.0]:
        _run(["java", "-jar", "recognizer.jar", "-w", "model/",
              "test/sas_plan.%s" % str(i), str(i), str(percentage),
              str(phi), str(lamb), str(delta), str(theta), str(output_results)])

# Clean up the "Feedback" path left behind by the tools above (presumably a
# directory -- TODO confirm); a missing path is not an error.
shutil.rmtree("Feedback", ignore_errors=True)



miner starts
../model/0.xes
../model/1.xes
mining complete
model/0.xes.pnml : indexed
model/1.xes.pnml : indexed
done
model/0.xes.pnml : indexed
model/1.xes.pnml : indexed
done
model/0.xes.pnml : indexed
model/1.xes.pnml : indexed
done
model/0.xes.pnml : indexed
model/1.xes.pnml : indexed
done
model/0.xes.pnml : indexed
model/1.xes.pnml : indexed
done
model/0.xes.pnml : indexed
model/1.xes.pnml : indexed
done
model/0.xes.pnml : indexed
model/1.xes.pnml : indexed
done
model/0.xes.pnml : indexed
model/1.xes.pnml : indexed
done
model/0.xes.pnml : indexed
model/1.xes.pnml : indexed
done
model/0.xes.pnml : indexed
model/1.xes.pnml : indexed
done


0