In [1]:
import pandas as pd
import stumpy
import numpy as np
import time
from util.util import *
csvPath = "logs/smartRPA/"
validation_path = csvPath + "validation/"

### Creation of artificial validation logs for TSDM Discovery in UI logs

Method:
  1. Get user interactions (a) and create a set of user actions (A)
  2. Select random actions (1 to n consequtive actions per looping) append them into a dataframe (D) until a upper limit (x) is reached
        - The upper limit x is considered as 1 action per 3 seconds in a 8 hour work day => 8* 60 * (60/3) = 9600 actions a day
  4. Get routines (r) (1-m overall) and insert the routines (r) o-times at random points into the dataframe (D)
        - The routines need not interrupt themselfs, otherwise no motif could be discovered (for future tests, the could interrupt as well)

Result: A dataframe (D) with x + (o * len(r)) number of actions containing m routines at random points

We create a set of UI logs which vary in the following constraints:
- Randomness of actions inserted into the logs
    - The actions can be sampled completly randomly
    - The actions can be consequitive actions taken from previous logs
- Number of motifs inserted
    - The length of the motif is defined by the original motif in the smartRPA log
- Number of times the motifs are inserted
- Length of the UI log

Objective: Create a dataframe that mimics a long time recording of users, which contains routines



The file name will be created as follows
"Log_ [#Randomness]_ [#Motifs] _[#MotifOccurance] _[#LogLength]_[#ShuffledBy]_[#ReducedBy].csv"

In [2]:
randomness = [5,10,25] # how long should the original action sequences be
motifs = [1,2,5] # how many different motifs should be inserted into the log
occurances = [2,5,10] # count of the motif occurances in the log
lengthLog = [28800] # [9600,19200,28800] # 9600 are Events for approximatly one working day

# ToDo or just take the original size of the case to make it more real world
lengthMotifs = [10,15,20,25] # Could be added to enter different length motifs into the data

# Shuffle and reduction of the event log
shuffle = 10
reduce = 10

concept_name_column = 'case:concept:name'
timeStampCol = "time:timestamp"

In [3]:
dfAll = read_csvs_and_combine(csvPath,10000)
# Drop duplicates based on equality assumption in https://doi.org/10.1016/j.compind.2022.103721          
subset=["category","application","concept:name","event_src_path","event_dest_path","browser_url","xpath"]
df_unique = dfAll.drop_duplicates(subset=subset)
print(f"There are {df_unique.shape[0]} unique events in the dataframe.")

data = {'Filename': [], 'Index': [], 'CaseOrder': []}
index_frame = pd.DataFrame(data)
for rand in randomness:
    for mot in motifs: # Number of motifs
        for occ in occurances:
            for l in lengthLog: # do not name len as shortcut, cause trouble with inbuild length function len()
                random_cases_list = get_random_values(dfAll, concept_name_column, mot, min_len=15)
                uiLog = get_rand_uiLog(dfAll, n_max=rand, actions=l) # set to actions=l for proper work

                # Filter rows with values in the list, because the length is shorter for the following loop
                filtered_df = dfAll[dfAll[concept_name_column].isin(random_cases_list)]

                uiLog, indices, random_cases_list = insert_motifs_non_overlap(random_cases_list=random_cases_list,
                                                           uiLog=uiLog,
                                                           dfcases=filtered_df,
                                                           occurances=occ,
                                                           case_column_name=concept_name_column,
                                                           sorted_insert_col=timeStampCol,
                                                           shuffled=True,
                                                           shuffled_by=shuffle,
                                                           reduced=True,
                                                           reduced_by=reduce)
                filename = f"Log_{rand}_{mot}_{occ}_{l}_{shuffle}_{reduce}.csv"
                filepath = validation_path + filename
                print(filename)
                uiLog.to_csv(filepath, index=False)

                # For tracking purpose and validation: Store the index of the cases in each log
                row = (filename, str(indices), str(random_cases_list)) 
                new_row_series = pd.Series(row, index=index_frame.columns)
                index_frame = pd.concat([index_frame, new_row_series.to_frame().T], ignore_index=True)

filepath = validation_path + "validation_data.csv"
index_frame.to_csv(filepath, index=False)

Maximum row limit of 10000 reached. Stopping reading additional files.
There are 1092 unique events in the dataframe.
After: Index loop i = 0, Len UI Log = 28823, random cases list len = 2, indices = [8762, 312]
After: Index loop i = 1, Len UI Log = 28846, random cases list len = 2, indices = [8785, 312]
Log_5_1_2_28800_10_10.csv
After: Index loop i = 0, Len UI Log = 28823, random cases list len = 5, indices = [25917, 23314, 19127, 16726, 12193]
After: Index loop i = 1, Len UI Log = 28846, random cases list len = 5, indices = [25940, 23314, 19127, 16726, 12193]
After: Index loop i = 2, Len UI Log = 28869, random cases list len = 5, indices = [25963, 23337, 19127, 16726, 12193]
After: Index loop i = 3, Len UI Log = 28892, random cases list len = 5, indices = [25986, 23360, 19150, 16726, 12193]
After: Index loop i = 4, Len UI Log = 28915, random cases list len = 5, indices = [26009, 23383, 19173, 16749, 12193]
Log_5_1_5_28800_10_10.csv
After: Index loop i = 0, Len UI Log = 28823, random 

The previously part created motifs have all the same length. However, the data in our approach can handle different length motifs as well.
We use the work in doi.org/10.1109/ACCESS.2023.3295995 to identif such motifs.
The logs created here contain different length sized motifs, resembling different length user tasks.

In [4]:
dfAll = read_csvs_and_combine(csvPath,10000)
# Drop duplicates based on equality assumption in https://doi.org/10.1016/j.compind.2022.103721          
subset=["category","application","concept:name","event_src_path","event_dest_path","browser_url","xpath"]
df_unique = dfAll.drop_duplicates(subset=subset)
print(f"There are {df_unique.shape[0]} unique events in the dataframe.")

data = {'Filename': [], 'Index': [], 'CaseOrder': [], 'CaseLength':[]}
index_frame = pd.DataFrame(data)
for rand in randomness:
        for occ in occurances:
            for l in lengthLog: # do not name len as shortcut, cause trouble with inbuild length function len()
                mot = len(lengthMotifs)
                random_cases_list = get_random_values(dfAll, concept_name_column, mot, min_len=max(lengthMotifs))
                filtered_df = dfAll[dfAll[concept_name_column].isin(random_cases_list)]

                # Reduce the cases in length and append again
                strCaseLength = ""
                for i, element in enumerate(lengthMotifs):
                    insert_df = filtered_df[filtered_df[concept_name_column] == random_cases_list[i]].sort_values(timeStampCol)
                    try:
                        variableLengthDf = pd.concat([insert_df.iloc[:element], variableLengthDf], ignore_index=True)
                    except NameError:
                        variableLengthDf = insert_df.iloc[:element-1]
                    strCaseLength = strCaseLength + f"{random_cases_list[i]}:{element}/"

                uiLog, indices, random_cases_list = insert_motifs_non_overlap(random_cases_list=random_cases_list,
                                                           uiLog=uiLog,
                                                           dfcases=variableLengthDf,
                                                           occurances=occ,
                                                           case_column_name=concept_name_column,
                                                           sorted_insert_col=timeStampCol,
                                                           shuffled=True,
                                                           shuffled_by=shuffle,
                                                           reduced=True,
                                                           reduced_by=reduce)
            
                filename = f"Var_Len_Log_{rand}_{mot}_{occ}_{l}_{shuffle}_{reduce}.csv"
                filepath = validation_path + filename
                uiLog.to_csv(filepath, index=False)

                # For tracking purpose and validation: Store the index of the cases in each log
                row = (filename, str(indices), str(random_cases_list),strCaseLength) 
                new_row_series = pd.Series(row, index=index_frame.columns)
                index_frame = pd.concat([index_frame, new_row_series.to_frame().T], ignore_index=True)

filepath = validation_path + "var_len_validation_data.csv"
index_frame.to_csv(filepath, index=False)

Maximum row limit of 10000 reached. Stopping reading additional files.
There are 1092 unique events in the dataframe.
After: Index loop i = 0, Len UI Log = 29964, random cases list len = 8, indices = [28000, 26796, 26434, 15588, 13087, 11234, 10206, 9334]
After: Index loop i = 1, Len UI Log = 29987, random cases list len = 8, indices = [28023, 26796, 26434, 15588, 13087, 11234, 10206, 9334]
After: Index loop i = 2, Len UI Log = 30001, random cases list len = 8, indices = [28037, 26810, 26434, 15588, 13087, 11234, 10206, 9334]
After: Index loop i = 3, Len UI Log = 30019, random cases list len = 8, indices = [28055, 26828, 26452, 15588, 13087, 11234, 10206, 9334]
After: Index loop i = 4, Len UI Log = 30028, random cases list len = 8, indices = [28064, 26837, 26461, 15597, 13087, 11234, 10206, 9334]
After: Index loop i = 5, Len UI Log = 30051, random cases list len = 8, indices = [28087, 26860, 26484, 15620, 13110, 11234, 10206, 9334]
After: Index loop i = 6, Len UI Log = 30069, random ca