In [3]:
import pandas as pd
import stumpy
import numpy as np
import time
from util.util import *
csvPath = "logs/smartRPA/"
validation_path = csvPath + "validation/"

### Creation of artificial validation logs for TSDM Discovery in UI logs

Method:
  1. Get user interactions (a) and create a set of user actions (A)
  2. Select random actions (1 to n consequtive actions per looping) append them into a dataframe (D) until a upper limit (x) is reached
        - The upper limit x is considered as 1 action per 3 seconds in a 8 hour work day => 8* 60 * (60/3) = 9600 actions a day
  4. Get routines (r) (1-m overall) and insert the routines (r) o-times at random points into the dataframe (D)
        - The routines need not interrupt themselfs, otherwise no motif could be discovered (for future tests, the could interrupt as well)

Result: A dataframe (D) with x + (o * len(r)) number of actions containing m routines at random points

We create a set of UI logs which vary in the following constraints:
- Randomness of actions inserted into the logs
    - The actions can be sampled completly randomly
    - The actions can be consequitive actions taken from previous logs
- Number of motifs inserted
    - The length of the motif is defined by the original motif in the smartRPA log
- Number of times the motifs are inserted
- Length of the UI log

Objective: Create a dataframe that mimics a long time recording of users, which contains routines



The file name will be created as follows
"Log_ [#Randomness]_ [#Motifs] _[#MotifOccurance] _[#LogLength]_[#ShuffledBy]_[#ReducedBy].csv"

In [4]:
randomness = [1] # how long should the original action sequences be
motifs = [1] # how many different motifs should be inserted into the log
occurances = [10,15,20,30,60] #motifs count of the motif occurances in the log
lengthLog = [1000,2000,4000,8000,12000,15000,17500,20000,25000,30000] # 9600 are Events for approximatly one working day
percentageMotifsOverLog = [1,2.5,5,10]

# ToDo or just take the original size of the case to make it more real world
lengthMotifs = [5,10,15,20,25] # Could be added to enter different length motifs into the data

# Shuffle and reduction of the event log
shuffle = 0
reduce = 0

concept_name_column = 'case:concept:name'
timeStampCol = "time:timestamp"

dfAll = read_csvs_and_combine(csvPath,1000000)
# Drop duplicates based on equality assumption in https://doi.org/10.1016/j.compind.2022.103721          
subset=["category","application","concept:name","event_src_path","event_dest_path","browser_url","xpath"]
df_unique = dfAll.drop_duplicates(subset=subset)
print(f"There are {df_unique.shape[0]} unique events in the dataframe.")

There are 14498 unique events in the dataframe.


In [None]:
data = {'Filename': [], 'Index': [], 'CaseOrder': []}
index_frame = pd.DataFrame(data)
for rand in randomness:
    # Not used at the moment as we have changed to percentage based calculation
    for l in lengthLog: # do not name len as shortcut, cause trouble with inbuild length function len()
        beginningLog = get_rand_uiLog(dfAll, n_max=rand, actions=l) # set to actions=l for proper work
        for occ in occurances:
            for mot in motifs: # Number of motifs
                uiLog = beginningLog
                random_cases_list = get_random_values(dfAll, concept_name_column, mot, min_len=15)
                # Filter rows with values in the list, because the length is shorter for the following loop
                filtered_df = dfAll[dfAll[concept_name_column].isin(random_cases_list)]

                uiLog, indices, random_cases_list = insert_motifs_non_overlap(random_cases_list=random_cases_list,
                                                           uiLog=uiLog,
                                                           dfcases=filtered_df,
                                                           occurances=occ,
                                                           case_column_name=concept_name_column,
                                                           sorted_insert_col=timeStampCol,
                                                           shuffled=False,
                                                           shuffled_by=shuffle,
                                                           reduced=False,
                                                           reduced_by=reduce)
                filename = f"Log_{rand}_{mot}_{occ}_{l}_{shuffle}_{reduce}_.csv"
                filepath = validation_path + filename
                print(filename)
                uiLog.to_csv(filepath, index=False)

                # For tracking purpose and validation: Store the index of the cases in each log
                row = (filename, str(indices), str(random_cases_list)) 
                new_row_series = pd.Series(row, index=index_frame.columns)
                index_frame = pd.concat([index_frame, new_row_series.to_frame().T], ignore_index=True)

filepath = validation_path + "validation_data_high_percentage.csv"
index_frame.to_csv(filepath, index=False)

The previously part created motifs have all the same length. However, the data in our approach can handle different length motifs as well.
We use the work in doi.org/10.1109/ACCESS.2023.3295995 to identif such motifs.
The logs created here contain different length sized motifs, resembling different length user tasks.

In [None]:
data = {'Filename': [], 'Index': [], 'CaseOrder': [], 'CaseLength':[]}
index_frame = pd.DataFrame(data)
for rand in randomness:
    for occ in occurances:
        for l in lengthLog: # do not name len as shortcut, cause trouble with inbuild length function len()
            uiLog = get_rand_uiLog(dfAll, n_max=rand, actions=l) # set to actions=l for proper work
            mot = len(lengthMotifs)
            random_cases_list = get_random_values(dfAll, concept_name_column, mot, min_len=max(lengthMotifs))
            filtered_df = dfAll[dfAll[concept_name_column].isin(random_cases_list)]

            # Reduce the cases in length and append again
            strCaseLength = ""
            for i, element in enumerate(lengthMotifs):
                insert_df = filtered_df[filtered_df[concept_name_column] == random_cases_list[i]].sort_values(timeStampCol)
                try:
                    variableLengthDf = pd.concat([insert_df.iloc[:element], variableLengthDf], ignore_index=True)
                except NameError:
                    variableLengthDf = insert_df.iloc[:element-1]
                strCaseLength = strCaseLength + f"{random_cases_list[i]}:{element}/"

            uiLog, indices, random_cases_list = insert_motifs_non_overlap(random_cases_list=random_cases_list,
                                                        uiLog=uiLog,
                                                        dfcases=variableLengthDf,
                                                        occurances=occ,
                                                        case_column_name=concept_name_column,
                                                        sorted_insert_col=timeStampCol,
                                                        shuffled=True,
                                                        shuffled_by=shuffle,
                                                        reduced=True,
                                                        reduced_by=reduce)
        
            filename = f"VarLenLog_{rand}_{mot}_{occ}_{l}_{shuffle}_{reduce}.csv"
            filepath = validation_path + filename
            uiLog.to_csv(filepath, index=False)

            # For tracking purpose and validation: Store the index of the cases in each log
            row = (filename, str(indices), str(random_cases_list), strCaseLength) 
            new_row_series = pd.Series(row, index=index_frame.columns)
            index_frame = pd.concat([index_frame, new_row_series.to_frame().T], ignore_index=True)

filepath = validation_path + "var_len_validation_data.csv"
index_frame.to_csv(filepath, index=False)

### Sample Data based on percentage instead of occurance

In [6]:
validationDataColumns = ["uiLogName","variationPercentage","numberOfOccurrancesToBeDiscovered","motifLength","percentageMotifsOverLog","logLength",
                     "motifSpots","caseIds"]
validationDataDF = pd.DataFrame(columns=validationDataColumns)
validation_path = csvPath + "percentageComparison/"

randomness = [1] # how long should the original action sequences be
motifs = [1] # how many different motifs should be inserted into the log
occurances = [10,15,20,30,60] #motifs count of the motif occurances in the log
percentageMotifsOverLog = [1,2.5,5,10]

# ToDo or just take the original size of the case to make it more real world
lengthMotifs = [5,10,15,20,25] # Could be added to enter different length motifs into the data


for mot in motifs:
    for rand in randomness:
        for occ in occurances:
            for percentage in percentageMotifsOverLog:
                for motifLen in lengthMotifs:
                    # Generate a filename and store the file
                    
                    # Calculate the length of the log based on the percentage values
                    l = ((occ*motifLen)/percentage*100)
                    samplingLength = l-(occ*motifLen)

                    if percentage == 2.5:
                        filename = f"LenLog_{rand}_{mot}_{occ}_{motifLen}_2-5_{int(l)}.csv"
                    else:
                        filename = f"LenLog_{rand}_{mot}_{occ}_{motifLen}_{percentage}_{int(l)}.csv"
                    
                    # Sample the UI log from all available data
                    uiLog = get_rand_uiLog(dfAll, n_max=rand, actions=int(samplingLength)) # set to actions=l for proper work
                    random_cases_list = get_random_values(dfAll, concept_name_column, mot, min_len=max(lengthMotifs))
                    filtered_df = dfAll[dfAll[concept_name_column].isin(random_cases_list)]
                    filtered_df = filtered_df.iloc[:motifLen]
                    
                    uiLog, indices, random_cases_list = insert_motifs_non_overlap(random_cases_list=random_cases_list,
                                                        uiLog=uiLog,
                                                        dfcases=filtered_df,
                                                        occurances=occ,
                                                        case_column_name=concept_name_column,
                                                        sorted_insert_col=timeStampCol,
                                                        shuffled=True,
                                                        shuffled_by=shuffle,
                                                        reduced=True,
                                                        reduced_by=reduce)
                    
                    
                   
                    new_row = {'uiLogName': filename, "variationPercentage":  shuffle, "motifsToBeDiscovered": mot, "numberOfOccurrancesToBeDiscovered": occ,
                    "motifLength": motifLen, "percentageMotifsOverLog": percentage, "logLength": l, "motifSpots": indices, "caseIds": random_cases_list}
                    print(filename)
                    filepath = validation_path + filename
                    uiLog.to_csv(filepath, index=False)
                    
                    validationDataDF = validationDataDF._append(new_row, ignore_index=True)


validationDataDF.to_csv(validation_path + "validationDataPercentage.csv")

LenLog_1_1_10_5_1_5000.csv


  validationDataDF = validationDataDF._append(new_row, ignore_index=True)


LenLog_1_1_10_10_1_10000.csv
LenLog_1_1_10_15_1_15000.csv
LenLog_1_1_10_20_1_20000.csv
LenLog_1_1_10_25_1_25000.csv
LenLog_1_1_10_5_2-5_2000.csv
LenLog_1_1_10_10_2-5_4000.csv
LenLog_1_1_10_15_2-5_6000.csv
LenLog_1_1_10_20_2-5_8000.csv
LenLog_1_1_10_25_2-5_10000.csv
LenLog_1_1_10_5_5_1000.csv
LenLog_1_1_10_10_5_2000.csv
LenLog_1_1_10_15_5_3000.csv
LenLog_1_1_10_20_5_4000.csv
LenLog_1_1_10_25_5_5000.csv
LenLog_1_1_10_5_10_500.csv
LenLog_1_1_10_10_10_1000.csv
LenLog_1_1_10_15_10_1500.csv
LenLog_1_1_10_20_10_2000.csv
LenLog_1_1_10_25_10_2500.csv
LenLog_1_1_15_5_1_7500.csv
LenLog_1_1_15_10_1_15000.csv
LenLog_1_1_15_15_1_22500.csv
LenLog_1_1_15_20_1_30000.csv
LenLog_1_1_15_25_1_37500.csv
LenLog_1_1_15_5_2-5_3000.csv
LenLog_1_1_15_10_2-5_6000.csv
LenLog_1_1_15_15_2-5_9000.csv
LenLog_1_1_15_20_2-5_12000.csv
LenLog_1_1_15_25_2-5_15000.csv
LenLog_1_1_15_5_5_1500.csv
LenLog_1_1_15_10_5_3000.csv
LenLog_1_1_15_15_5_4500.csv
LenLog_1_1_15_20_5_6000.csv
LenLog_1_1_15_25_5_7500.csv
LenLog_1_1_15_5_10_