In [1]:
import os
import time

import numpy as np
import pandas as pd
import stumpy

from util.util import *
# Folder holding the recorded smartRPA logs; generated validation logs go into its "validation/" subfolder.
csvPath = 'logs/smartRPA/'
validation_path = f"{csvPath}validation/"

### Creation of artificial validation logs for TSDM Discovery in UI logs

Method:
  1. Get user interactions (a) and create a set of user actions (A)
  2. Select random actions (1 to n consecutive actions per loop iteration) and append them to a dataframe (D) until an upper limit (x) is reached
        - The upper limit x assumes 1 action every 3 seconds in an 8-hour work day => 8 * 60 * (60/3) = 9600 actions a day
  3. Get routines (r) (1-m overall) and insert the routines (r) o-times at random points into the dataframe (D)
        - Routine occurrences must not interrupt (overlap) one another; otherwise no motif could be discovered (in future tests, they could be allowed to interrupt each other as well)

Result: A dataframe (D) with x + (o * len(r)) number of actions containing m routines at random points

We create a set of UI logs which vary in the following constraints:
- Randomness of actions inserted into the logs
    - The actions can be sampled completely at random
    - The actions can be consecutive actions taken from previous logs
- Number of motifs inserted
    - The length of the motif is defined by the original motif in the smartRPA log
- Number of times the motifs are inserted
- Length of the UI log

Objective: Create a dataframe that mimics a long time recording of users, which contains routines



The file name will be created as follows:
`Log_[#Randomness]_[#Motifs]_[#MotifOccurrence]_[#LogLength].csv`

In [2]:
# Experiment grid: one validation log is generated per combination of the four lists below.
randomness = [1, 5, 10]  # upper bound on the length of each sampled original action sequence
motifs = [1, 2, 5]  # number of different motifs inserted into a log
occurances = [2, 3, 5, 10]  # how often the motifs occur in a log
lengthLog = [9600 * days for days in (1, 2, 3)]  # 9600 events are approximately one working day

# TODO: alternatively keep the original size of the case to stay closer to real-world data.
lengthMotifs = [10, 15, 20, 25]  # could be used to insert motifs of different lengths into the data

# Column names of the UI log: case identifier and event timestamp.
column_name = "case:concept:name"
timeStampCol = 'time:timestamp'

In [4]:
# Generate one artificial validation log per (randomness, motifs, occurrences, log length)
# combination and record where the motifs were inserted.
#
# Outputs (written below `validation_path`):
#   - Log_<rand>_<mot>_<occ>_<len>.csv : the generated UI log
#   - validation_data.csv              : one row per log with the inserted motif indices
#
# NOTE(review): the sampling helpers come from util.util and are presumably random; no seed is
# set here, so re-running the cell will most likely produce different logs — confirm and seed
# the RNG they use if reproducibility is required.

# Second argument appears to be a row limit (the helper reports "Maximum row limit of 10000 reached").
dfAll = read_csvs_and_combine(csvPath,10000)
# Drop duplicates based on equality assumption in https://doi.org/10.1016/j.compind.2022.103721
subset=["category","application","concept:name","event_src_path","event_dest_path","browser_url","xpath"]
df_unique = dfAll.drop_duplicates(subset=subset)
print(f"There are {df_unique.shape[0]} unique events in the dataframe.")

# Make sure the output folder exists so that to_csv does not fail on a fresh checkout.
os.makedirs(validation_path, exist_ok=True)

# Collect the tracking rows in a plain list and build the DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop is quadratic and warns on empty frames.
index_records = []
for rand in randomness:
    for mot in motifs:
        for occ in occurances:
            for log_len in lengthLog:  # do not shadow the built-in len()
                filename = f"Log_{rand}_{mot}_{occ}_{log_len}.csv"
                print(filename)
                random_cases_list = get_random_values(dfAll, column_name, mot, min_len=15)
                uiLog = get_rand_uiLog(dfAll, n_max=rand, actions=log_len)
                uiLog, indices = insert_motifs_non_overlap(random_cases_list=random_cases_list,
                                                           uiLog=uiLog,
                                                           dfcases=dfAll,
                                                           occurances=occ,
                                                           case_column_name=column_name,
                                                           sorted_insert_col=timeStampCol,
                                                           shuffled=True,
                                                           shuffled_by=10,
                                                           reduced=True,
                                                           reduced_by=10)
                filepath = validation_path + filename
                uiLog.to_csv(filepath, index=False)

                # For tracking purpose and validation: store the index of the cases in each log
                index_records.append({"Filename": filename, "Index": str(indices)})

# Explicit columns keep the CSV header stable even if no log was generated.
index_frame = pd.DataFrame(index_records, columns=["Filename", "Index"])
filepath = validation_path + "validation_data.csv"
index_frame.to_csv(filepath, index=False)

Maximum row limit of 10000 reached. Stopping reading additional files.
There are 1092 unique events in the dataframe.
Log_1_1_2_9600.csv
After: Index loop i = 0, Len UI Log = 9623, random cases list len = 2, indices = [7420, 6368]
After: Index loop i = 1, Len UI Log = 9646, random cases list len = 2, indices = [7443, 6368]
Log_1_1_2_19200.csv
After: Index loop i = 0, Len UI Log = 19223, random cases list len = 2, indices = [14128, 1412]
After: Index loop i = 1, Len UI Log = 19246, random cases list len = 2, indices = [14151, 1412]
Log_1_1_2_28800.csv
After: Index loop i = 0, Len UI Log = 28823, random cases list len = 2, indices = [23805, 14982]
After: Index loop i = 1, Len UI Log = 28846, random cases list len = 2, indices = [23828, 14982]
Log_1_1_3_9600.csv
After: Index loop i = 0, Len UI Log = 9623, random cases list len = 3, indices = [6764, 5912, 5252]
After: Index loop i = 1, Len UI Log = 9646, random cases list len = 3, indices = [6787, 5912, 5252]
After: Index loop i = 2, Len U