# Notebook to generate an extended UI Log

Based on the Student Record (SR) and Reimbursement (RT) logs from Leno et al., this notebook generates an extended version of the log.

The logs are in the folder called "Leno".

Properties of the original logs:
1. SR_RT_joint: Containing all Student Record traces and afterwards all Reimbursement traces.
2. ST_RT_paarallel: Contains all Student Records traces alternating with all Reimbursement traces.

Gathering of the original Leno data from https://figshare.com/articles/dataset/UI_logs/12543587

Properties of the Extended Logs:
1. Extended_SR_RT_joint: Between all traces there are X randomly generated events. X can be set in this notebook.
2. Extended_ST_RT_parallel: Same as for Extended_SR_RT_joint.

In [2]:
import sys
sys.path.append('../') # To import from parent dir

import random

from datetime import datetime

from util.util import read_data_for_processing

import pandas as pd
import numpy as np

file_path = '../logs/Leno/'
srrt_plus_filename = "SR_RT_plus"
srrt_parallel_filename = "SR_RT_parallel"

list_of_logs = [srrt_plus_filename, srrt_parallel_filename,]

text_encoding_method = "utf-8"
seperator = ";"

  from .autonotebook import tqdm as notebook_tqdm


#### Execution of Log Generation for Discovery

Generating two extended logs
1. Adding a case id for all existing cases
2. Adding 50 random actions between all cases to simulate long time recording

In [39]:
def generate_random_rows(df: pd.DataFrame, num_rows: int) -> list[pd.DataFrame]:
    """
    Generates a list of random rows from existing DataFrame values, and adds random data for URL,
    content, and target.workbookName columns.
    """
    columns_to_shuffle = [col for col in df.columns if col != "caseid"]
    
    # Predefined lists for random data
    random_actions = [
        "copyCell", "paste", "editField", "clickButton", "clickLink", "selectWorksheet",
        "copyRange", "form_submit", "createNewTab"
    ]

    random_urls = [
        "https://example.com", "https://example.org", "https://example.net", "", " ", 
        "https://randomsite.com", "https://testsite.com", "https://anotherurl.com", 
        "https://sap.example.com", "https://service-now.example.com", "https://salesforce.example.com", 
        "https://jira.example.com", "https://confluence.example.com", "https://microsoft.com", 
        "https://office365.example.com", "https://slack.com", "https://zoom.us", 
        "https://google.com", "https://github.com", "https://linkedin.com", 
        "https://workday.example.com", "https://oracle.com", "https://adobe.com"
    ]
    random_content = [
        "Lorem ipsum dolor sit amet.", "This is a random content example.", 
        "Random text for data generation.", "", " ", "Sample content for testing.", 
        "Another random string.", "Test content for the application.", 
        "Placeholder text for demonstration.", "Randomly generated content."
    ]
    random_file_names = [
        "report_final.xlsx", "data_analysis.csv", "project_notes.pdf", "budget_2023.xlsx",
        "presentation.csv", "meeting_minutes.xlsx", "research_paper.xml"
    ]
    random_cell_names = [
        "A1", "B2", "C3", "D4", "E5", "F6", "G7", "H8", "I9", "J10", "K11", "L12",
        "M13", "N14", "O15", "P16", "Q17", "R18", "S19", "T20", "U21", "V22", "W23",
        "X24", "Y25", "Z26", "AA27", "AB28", "AC29", "AD30", "AE31", "AF32", "AG33",
    ]
    random_html_tags = [
        "div", "span", "p", "a", "img", "h5", "h6", "ul", "ol", "li",
        "table", "form", "input", "button", "select", "option", "textarea", "label"
    ]
    random_sheet_names = [
        "Sheet1", "Sheet2", "Sheet3", "Sheet4", "Sheet5", "Sheet6", "Sheet7"
    ]
    
    rows = []

    for _ in range(num_rows):
        random_row = {col: random.choice(df[col].tolist()) for col in columns_to_shuffle}

        # fixed values
        random_row["eventType"] = random.choice(random_actions)
        random_row["content"] = random.choice(random_content)

        if random_row["targetApp"] == "Chrome":
            random_row["target.sheetName"] = ""
            random_row["target.workbookName"] = ""
            random_row["url"] = random.choice(random_urls)
            random_row["target.id"] = random.choice(random_html_tags)

        elif random_row["targetApp"] == "Excel":
            random_row["target.id"] = random.choice(random_cell_names)
            random_row["target.workbookName"] = random.choice(random_file_names)
            random_row["target.sheetName"] = random.choice(random_sheet_names)
            random_row["target.tagName"] = ""
            random_row["url"] = ""
            random_row["target.type"] = ""
            random_row["target.href"] = ""

        rows.append(random_row)
                                            
    return pd.DataFrame(rows)
                                        

### Ground Truth Calculation for unextended Log

Due to incomplete routines: The last SR and the first RT Routine do not match in the PLUS log. Have to be adjusted manually as automating this is not feasible.
Same for the SRRT Parallel Log: The last ground truth values have to be adjusted to match the original data.

In [40]:
for log_filename in list_of_logs:
    df = pd.read_csv(file_path + log_filename + ".csv", encoding=text_encoding_method, sep=seperator)
    df["caseid"] = -1
    cond = (df["url"] == "https://forms.zoho.com/universityofmelbourne/form/NewRecord/thankyou") | ((df["url"] == "https://submit.jotform.com/submit/200477494954062/") & (df["eventType"] == "clickLink"))
    insert_positions = df.index[cond].tolist()
    blocks = []
    og_block_nr = 1
    current = 0
    current_out_index = 0
    gt_df = pd.DataFrame(columns=["caseid","start_index","length","motif"])
    for pos in insert_positions:
        length = pos - current
        motif = "RT" if length > 40 else "SR"
        gt_new_row = {
            "caseid": og_block_nr,
            "start_index": current_out_index,
            "length": length,
            "motif": motif
        }
        gt_df = pd.concat([gt_df, pd.DataFrame([gt_new_row])], ignore_index=True)
        og_block_nr += 1
        if current_out_index == 0:
            current_out_index += length +1 
        else:
            current_out_index += length
        current = pos

    # Adding Case IDs >> Important >> Some manual changes will be necessary after this step to ensure correctness, because the rule under cond does not guarantee perfect splits.
    df["caseid"] = 0
    for i,gt in gt_df.iterrows():
        start = gt["start_index"]
        df.loc[start:, "caseid"] = i + 1
    
    df.to_csv(file_path + "202511_" + log_filename + ".csv", index=False, sep=seperator, encoding=text_encoding_method)
    gt_df.to_csv(file_path + "202511_" + log_filename + "_ground_truth.csv", index=False, sep=seperator, encoding=text_encoding_method)

### Creation of extended Log and Calculation of Ground Truth

In [41]:
for log_filename in list_of_logs:
    df = pd.read_csv(file_path + log_filename + ".csv", encoding=text_encoding_method, sep=seperator)
    df["caseid"] = -1
    cond = (df["url"] == "https://forms.zoho.com/universityofmelbourne/form/NewRecord/thankyou") | ((df["url"] == "https://submit.jotform.com/submit/200477494954062/") & (df["eventType"] == "clickLink"))
    insert_positions = df.index[cond].tolist()
    blocks = []
    og_block_nr = 1
    current = 0
    current_out_index = 0
    num_of_inserts = 50
    gt_df = pd.DataFrame(columns=["caseid","start_index","length","motif"])
    for pos in insert_positions:
        block = df.iloc[current:pos+1].copy()
        block["caseid"] = og_block_nr
        blocks.append(block)

        rand = generate_random_rows(df, num_of_inserts).copy()
        blocks.append(rand)

        length = pos - current +1
        motif = "RT" if length > 40 else "SR"

        gt_new_row = {
            "caseid": og_block_nr,
            "start_index": current_out_index,
            "length": length,
            "motif": motif
        }

        gt_df = pd.concat([gt_df, pd.DataFrame([gt_new_row])], ignore_index=True)

        og_block_nr += 1
        current_out_index += length + num_of_inserts
        current = pos+1

    blocks.append(df.iloc[current:])  # tail

    out = pd.concat(blocks, ignore_index=True)
    out.to_csv(file_path + "202511_" + log_filename + "_extended.csv", index=False, sep=seperator, encoding=text_encoding_method)
    gt_df.to_csv(file_path + "202511_" + log_filename + "_extended_ground_truth.csv", index=False, sep=seperator, encoding=text_encoding_method)

### Annotate the rows with caseId, Zeros for Noise and Task Name

In [None]:
df_plus = pd.read_csv(file_path + "202511_SR_RT_plus.csv", sep=";")
df_parallel = pd.read_csv(file_path + "202511_SR_RT_parallel.csv", sep=";")
df_plus_ext = pd.read_csv(file_path +"202511_SR_RT_plus_extended.csv", sep=";")
df_parallel_ext = pd.read_csv(file_path + "202511_SR_RT_parallel_extended.csv", sep=";")

df_plus["Task"] = np.nan
df_parallel["Task"] = np.nan
df_plus.loc[df_plus["caseid"]  >= 1, "Task"] = "StudentRecord"
df_plus.loc[df_plus["caseid"] >50 , "Task"] = "Reimbursement"
df_plus.loc[df_plus["caseid"] == 0, "Task"] = ""
df_plus.rename(columns={"caseid":"idx"}, inplace=True)
df_plus.to_csv(file_path + "202511_SR_RT_plus_labeled4Rebmann.csv", sep=",", index=False)

df_parallel.loc[df_parallel["caseid"] % 2 >= 1, "Task"] = "StudentRecord"
df_parallel.loc[df_parallel["Task"].isna() , "Task"] = "Reimbursement"
df_parallel.loc[df_parallel["caseid"].isna(), "Task"] = ""
df_parallel.rename(columns={"caseid":"idx"}, inplace=True)
df_parallel.to_csv(file_path + "202511_SR_RT_parallel_labeled4Rebmann.csv", sep=",", index=False)

df_plus_ext["Task"] = ""
df_parallel_ext["Task"] = ""
df_plus_ext.loc[df_plus_ext["caseid"] >= 1, "Task"] = "StudentRecord"
df_plus_ext.loc[df_plus_ext["caseid"] >50 , "Task"] = "Reimbursement"
df_plus_ext.loc[df_plus_ext["caseid"].isna(), "caseid"] = 0
df_plus_ext.loc[df_plus_ext["caseid"] == 0, "Task"] = "Noise"
df_plus_ext.rename(columns={"caseid":"idx"}, inplace=True)
df_plus_ext["idx"] = pd.to_numeric(df_plus_ext["idx"])
df_plus_ext.to_csv(file_path + "202511_SR_RT_plus_extended_labeled4Rebmann.csv", sep=",", index=False)

df_parallel_ext.loc[df_parallel_ext["caseid"] % 2 >= 1, "Task"] = "StudentRecord"
df_parallel_ext.loc[df_parallel_ext["Task"].isna() , "Task"] = "Reimbursement"
df_parallel_ext.loc[df_parallel_ext["caseid"].isna(), "caseid"] = 0
df_parallel_ext.loc[df_parallel_ext["caseid"] == 0, "Task"] = "Noise"
df_parallel_ext.rename(columns={"caseid":"idx"}, inplace=True)
df_parallel_ext.to_csv(file_path + "202511_SR_RT_parallel_extended_labeled4Rebmann.csv", sep=",", index=False)

  df_plus.loc[df_plus["caseid"]  >= 1, "Task"] = "StudentRecord"
  df_parallel.loc[df_parallel["caseid"] % 2 >= 1, "Task"] = "StudentRecord"
  df_plus_ext.loc[df_plus_ext["caseid"]  >= 1, "Task"] = "StudentRecord"
  df_parallel_ext.loc[df_parallel_ext["caseid"] % 2 >= 1, "Task"] = "StudentRecord"


### Create Ground Truth for Leno Exec

In [50]:
# Settings

# Folder path in which the UI logs are which should be transformed into RPM Segmentor ground truth or SmartRPA segmentor files
folder_path = "../logs/Leno/"

# Settings for Leno Logs generated for experiment
logs = ["202511_SR_RT_plus.csv","202511_SR_RT_parallel.csv","202511_SR_RT_plus_extended.csv","202511_SR_RT_parallel_extended.csv"]
gts = ["202511_SR_RT_plus_ground_truth.csv","202511_SR_RT_parallel_ground_truth.csv","202511_SR_RT_plus_extended_ground_truth.csv","202511_SR_RT_parallel_extended_ground_truth.csv"]

seperator = ";" # "," for SmartRPA, ";" for Tockler/AWT
encoding_method = "utf-8" # UTF-8 for SmartRPA, latin-1 for Tockler/AWT

# Read Files 1. Log 2. Validation Data to identify patterns
for i,log in enumerate(logs):
    print(log)
    log_filename = log
    log = pd.read_csv(folder_path + log_filename, encoding=encoding_method, sep=seperator)
    validation_data = pd.read_csv(folder_path + gts[i], encoding=encoding_method, sep=seperator)
    # Get Index of Motifs

    motifLength = int(validation_data["length"].iloc[0])

    motifSpots = validation_data["start_index"].tolist()
    caseAtSpot = validation_data["caseid"].tolist()
    motifAtSpot = validation_data["motif"].tolist()

    # Generate Filename
    grundTruth_File = log_filename.split(".")[0] + ".txt"
    pattern_complete = ""
    unique_patterns_set = set()
    for row in validation_data.itertuples():
        pattern = ""
        for _, row in log.iloc[row[2]:row[2] + row[3]].iterrows():
            activityPattern = f"{row['eventType']}".strip()
            pattern = pattern + activityPattern.replace(" ","") + " -1 "

        # ----------------------------------------------------
        # Add this line here to clean the final pattern string
        pattern = pattern.strip()
        # ----------------------------------------------------

        # Add only unique patterns
        if pattern not in unique_patterns_set:
            unique_patterns_set.add(pattern)
            pattern_complete = pattern_complete + pattern + " -2\n"
        else:
            print("Duplicate Found, skipping...")


    with open(folder_path + log_filename.split(".")[0] + ".txt", 'w') as f:
        f.write(pattern_complete)
    

202511_SR_RT_plus.csv
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skipping...
Duplicate Found, skip

### Log Conversion for Agostinelli

In [None]:
logs = ["202511_SR_RT_plus.csv","202511_SR_RT_parallel.csv","202511_SR_RT_plus_extended.csv","202511_SR_RT_parallel_extended.csv"]
gts = ["202511_SR_RT_plus_ground_truth.csv","202511_SR_RT_parallel_ground_truth.csv","202511_SR_RT_plus_extended_ground_truth.csv","202511_SR_RT_parallel_extended_ground_truth.csv"]
folder_path = "../logs/Leno/"

for i,log in enumerate(logs):
     leno_file_name = logs[i]
     leno_gt_file_name = gts[i]
     data_for_processing = read_data_for_processing(isSmartRPA2024=False,
                                               isSmartRPA2025=False,
                                                    isRealWorldTest=False,
                                                    isActionLogger=True,
                                                    leno_file_name=leno_file_name,
                                                    leno_gt_file_name=leno_gt_file_name,
                                                    isHCI=False,
                                                    log_name_smartRPA="",
                                                    encoding_method=1)

     # Clear existing file
     file_location = folder_path + leno_file_name.split(".")[0] + "_4_agostinelli"
     open(file_location, 'w').close()
     # Unpack the returned dictionary
     log = data_for_processing["log"]
     ground_truth = data_for_processing["ground_truth"]

     # Create Log File
     timestamp = "2025-12-14 10:00:10+01:00"
     end_index_set = set(ground_truth["end_index"])
     start_minus_one_set = set(ground_truth["start_index"]-1)
     end_delimiter = timestamp + " x x ON x\n"
     for i, row in log.iterrows():
          activity = row["symbol"]
          pattern_complete = timestamp + " " + activity + " " + activity + " ON Other_Activity\n"
          with open(file_location, 'a') as f:
               f.write(pattern_complete)
          if i in end_index_set:
               with open(file_location, 'a') as f:
                    f.write(end_delimiter)
          # No Delimiter after noise required > if delimiter after noise necessary add the following code
          # if "extended" in file_location and i in start_minus_one_set:
          #      with open(file_location, 'a') as f:
          #           f.write(end_delimiter)
               

     # Create Config File
     config_file_location = folder_path + leno_file_name.split(".")[0] + "_4_agostinelli.config"
     sorted_List = list(set(log["symbol"]))
     sorted_List.sort()
     print(sorted_List)
     res = ' '.join(sorted_List)
     with open(config_file_location, 'w')as f:
          f.write("sensor " + res + "\ndata UILog.config\nmodel model\nnumiterations -1")

Processing file: 202511_SR_RT_plus.csv with 4646 events.
Using Word2Vec based encoding for UI Log
['A', 'AA', 'AAA', 'AAB', 'AAC', 'AAD', 'AAE', 'AAF', 'AAG', 'AAH', 'AAI', 'AAJ', 'AAK', 'AAL', 'AAM', 'AAN', 'AAO', 'AAP', 'AAQ', 'AAR', 'AAS', 'AAT', 'AAU', 'AAV', 'AAW', 'AAX', 'AAY', 'AAZ', 'AB', 'ABA', 'ABB', 'ABC', 'ABD', 'ABE', 'ABF', 'ABG', 'ABH', 'ABI', 'ABJ', 'ABK', 'ABL', 'ABM', 'ABN', 'ABO', 'ABP', 'ABQ', 'ABR', 'ABS', 'ABT', 'ABU', 'ABV', 'ABW', 'ABX', 'ABY', 'ABZ', 'AC', 'ACA', 'ACB', 'ACC', 'ACD', 'ACE', 'ACF', 'ACG', 'ACH', 'ACI', 'ACJ', 'ACK', 'ACL', 'ACM', 'ACN', 'ACO', 'ACP', 'ACQ', 'ACR', 'ACS', 'ACT', 'ACU', 'ACV', 'ACW', 'ACX', 'ACY', 'ACZ', 'AD', 'ADA', 'ADB', 'ADC', 'ADD', 'ADE', 'ADF', 'ADG', 'ADH', 'ADI', 'ADJ', 'ADK', 'ADL', 'ADM', 'ADN', 'ADO', 'ADP', 'ADQ', 'ADR', 'ADS', 'ADT', 'ADU', 'ADV', 'ADW', 'ADX', 'ADY', 'ADZ', 'AE', 'AEA', 'AEB', 'AEC', 'AED', 'AEE', 'AEF', 'AEG', 'AEH', 'AEI', 'AEJ', 'AEK', 'AEL', 'AEM', 'AEN', 'AEO', 'AEP', 'AEQ', 'AER', 'AES', 'AET'