In [2]:
import sys
sys.path.append('../') # To import from parent dir
import os

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import util.util as util

import datetime

### Sample Data based on percentage instead of occurrence

Set the parameters in the following cell to produce the desired output. Each parameter is described by the comment that follows it.
Do not change anything below the `# DO NOT Change from here` marker in that cell.

Execute the following two cells to generate the validation logs based on the SmartRPA validation logs in the folder "logs/SmartRPA/".
The data will be available after the complete execution. In particular, the file containing all indexes and parameters will only be available once all loops have been executed.

1. Get unique actions
2. Calculate the log size based on
    1. Number of Motifs
    2. Length of Motifs 
    3. Occurrences per Motif
    4. Percentage over Log
3. Get randomized log in size to fit motifs into the data (log_wo_motif)
    1. Sample sequences between 1-motif length until size is reached
4. Generate `number of motifs` motifs, each of size `length`
5. Generate random insert values between 0 and len(log_wo_motif)
6. Insert the motifs from back to front
    1. Shuffle the template if shuffled
    2. Add it at index with log_wo_motif[0:index] + template + log_wo_motif[index:len(log_wo_motif)]
    3. Store insert value in df
    4. increment all inserted for ground truth data
7. Store Log
8. Add information to ground_truth data

In [3]:
# ---- Experiment grid: the generation cell iterates over every combination of these lists ----
different_motif_count = [5,3,2,1] # Different motif types
lengths_count = [5,10,20,50] # Different lengths of motifs
occurances_count = [5,10,20] # Different number of occurrences of motifs
percentageMotif_counts = [1,10,25,50,75,100] # Different percentage of motif in the time series
percentageShuffle_counts = [0,0.1,0.2] # Different percentage of shuffle of actions in the motif

# ---- Set the Data Path for the csv files used for the data sampling and where the logs should be added ----
csvPath = "../logs/smartRPA/OriginalAgostinelliLogs/"
validation_path = "../logs/smartRPA/202511-update/"

# exist_ok=True creates the output folder only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(validation_path, exist_ok=True)

# DO NOT Change from here for synthetic data!
# ---- Columns to generate the validation data for the experiment ----
# Four general columns, followed by five columns per motif slot. The number of
# slots equals the largest motif count in the grid, so every generated log fits.
validationDataColumns = ["uiLogName","percentageMotifsOverLog","logLength","noOfMotifs"]
for i in range(max(different_motif_count)):
    validationDataColumns.append(f"motif{i}-length")
    validationDataColumns.append(f"motif{i}-occurances")
    validationDataColumns.append(f"motif{i}-shuffle")
    validationDataColumns.append(f"motif{i}-startIndexes")
    validationDataColumns.append(f"motif{i}-caseID")

# Empty frame that the generation cell fills with one row per generated log.
validationDataDF = pd.DataFrame(columns=validationDataColumns)
concept_name_column = 'case:concept:name'
timeStampCol = "time:timestamp"

# ---- Gathering of unique events until the upper limit is reached ----
# The second argument is the row limit (the helper prints a
# "Maximum row limit of ... reached" message once it is hit).
dfAll = util.read_csvs_and_combine(csvPath,250000)
# Drop duplicates based on equality assumption in https://doi.org/10.1016/j.compind.2022.103721
subset=["category","application","concept:name","event_src_path","event_dest_path","browser_url","xpath"]
df_unique = dfAll.drop_duplicates(subset=subset)
print(f"There are {df_unique.shape[0]} unique events in the dataframe.")

Maximum row limit of 250000 reached. Stopping reading additional files.
There are 7096 unique events in the dataframe.


In [4]:
# Total number of (motifs, occurances, length, percentageMotif, shuffle)
# combinations; used as the denominator of the progress bar.
total_loops = (
    len(different_motif_count)
    * len(occurances_count)
    * len(lengths_count)
    * len(percentageMotif_counts)
    * len(percentageShuffle_counts)
)
# Target file for the per-log parameters; the path never changes, so build it once.
validation_info_file = os.path.join(validation_path, "validationLogInformation.csv")
loop_counter = 0
start_time = datetime.datetime.now()
for motifs in different_motif_count:
    for occurances in occurances_count:
        # Start the timer once per (motifs, occurances) pair so that the summary
        # printed after the length loop covers all lengths, not only the last one.
        timeForPreparing = datetime.datetime.now()
        for length in lengths_count:
            print("--------------------------------------------------")
            # Kept out of the two inner loops because this is the most time consuming
            # part; it runs once per (motifs, occurances, length) combination.
            # NOTE(review): presumably selects `motifs` random cases with at least
            # max(lengths_count) events - confirm in util.get_random_values.
            random_cases_list = util.get_random_values(dfAll, concept_name_column, motifs, min_len=max(lengths_count))
            # Keep only the events of the selected cases, ordered by case and timestamp.
            only_case_actions_df = dfAll[dfAll[concept_name_column].isin(random_cases_list[concept_name_column])]
            only_case_actions_df = only_case_actions_df.sort_values(by=[concept_name_column,timeStampCol])
            # Total motif events at the largest motif length, scaled up to the smallest
            # motif percentage. True division makes this a float.
            max_action_count = motifs * max(lengths_count) * occurances * 100 / min(percentageMotif_counts)
            print(f"Max action count for random log generation: {max_action_count}")
            randomDF = util.get_rand_uiLog(dfAll, n_max= max(lengths_count) // 2, action_count=max_action_count, printing=True)
            for percentageMotif in percentageMotif_counts:
                for shuffle in percentageShuffle_counts:
                    timeForGenerating = datetime.datetime.now()
                    loop_counter += 1
                    util.print_progress_bar(loop_counter, total_loops)
                    print(f"Processing: {motifs}-{different_motif_count} | {occurances}-{occurances_count} | {length}-{lengths_count} | {percentageMotif}-{percentageMotif_counts} | {shuffle}-{percentageShuffle_counts}")
                    new_row = util.generate_log_from_data_seed(dfAll, 
                                                            randomDF,
                                                            validation_path,
                                                            concept_name_column,
                                                            only_case_actions_df,
                                                            random_cases_list,
                                                            motifs,
                                                            occurances,
                                                            length,
                                                            percentageMotif,
                                                            shuffle)
                    # NOTE(review): `_append` is a private pandas method (the public
                    # DataFrame.append was removed in pandas 2.0). Consider switching to
                    # pd.concat once the return type of generate_log_from_data_seed
                    # is confirmed.
                    validationDataDF = validationDataDF._append(new_row, ignore_index=True)
                    # Safety store: rewrite the file after every log so an interrupted
                    # run still leaves the rows collected so far on disk.
                    validationDataDF.to_csv(validation_info_file, index=False)
                    print(f"Time taken for generating log: {datetime.datetime.now() - timeForGenerating}")
        print(f"Time taken for preparing data for motif count {motifs} and occurance count {occurances}: {datetime.datetime.now() - timeForPreparing} (Overall time: {datetime.datetime.now() - start_time})")
# Save the validation data log information
validationDataDF.to_csv(validation_info_file, index=False)

--------------------------------------------------
Max action count for random log generation: 125000.0
Current generated UiLog length: 0
Current generated UiLog length: 500
Current generated UiLog length: 1100
Current generated UiLog length: 1200
Current generated UiLog length: 3100
Current generated UiLog length: 3200
Current generated UiLog length: 4000
Current generated UiLog length: 4300
Current generated UiLog length: 5100
Current generated UiLog length: 5200
Current generated UiLog length: 5300
Current generated UiLog length: 5700
Current generated UiLog length: 5800
Current generated UiLog length: 6200
Current generated UiLog length: 6700
Current generated UiLog length: 7500
Current generated UiLog length: 7700
Current generated UiLog length: 8200
Current generated UiLog length: 8300
Current generated UiLog length: 8500
Current generated UiLog length: 9400
Current generated UiLog length: 9700
Current generated UiLog length: 9800
Current generated UiLog length: 10300
Current gen