# Notebook to generate an extended UI Log

Based on the Student Record (SR) and Reimbursement (RT) logs from Leno et al., this notebook generates an extended version of the log.

The logs are in the folder called "Leno".

Properties of the original logs:
1. SR_RT_joint: Contains all Student Record traces, followed by all Reimbursement traces.
2. SR_RT_parallel: Contains all Student Record traces alternating with all Reimbursement traces.

The original Leno data was obtained from https://figshare.com/articles/dataset/UI_logs/12543587

Properties of the Extended Logs:
1. Extended_SR_RT_joint: X randomly generated events are inserted between all traces. X can be set in this notebook.
2. Extended_SR_RT_parallel: Same as for Extended_SR_RT_joint.

In [11]:
import random

from datetime import datetime

import pandas as pd
import numpy as np

# Folder holding the Leno UI logs; every file read or written below lives here.
file_path = "../logs/Leno/"

# Input logs processed by the loops further down.
srrt_plus_filename = "SR_RT_plus.csv"
srrt_parallel_filename = "SR_RT_parallel.csv"
list_of_logs = [
    srrt_plus_filename,
    srrt_parallel_filename,
]

# CSV dialect shared by every read_csv / to_csv call in this notebook.
text_encoding_method = "utf-8"
seperator = ";"

#### Execution of Log Generation for Discovery

Generating two extended logs
1. Adding a case id for all existing cases
2. Adding 50 random actions between all cases to simulate long time recording

In [12]:
def generate_random_rows(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:
    """
    Build ``num_rows`` synthetic noise events that resemble the rows of ``df``.

    Every column of ``df`` except ``caseid`` is filled with a value drawn
    independently at random from that column, so a generated row mixes values
    from different source rows. ``eventType`` and ``content`` are then replaced
    with values from fixed pools. Depending on the sampled ``targetApp``:

    * ``"Chrome"``: workbook/sheet names are blanked, ``url`` and ``target.id``
      are replaced with a random URL and HTML tag name.
    * ``"Excel"``: ``target.id``, workbook and sheet names are replaced with
      random placeholders, and the browser-specific columns are blanked.
    * any other value: the sampled values are kept as they are.

    Randomness comes from the module-level ``random`` generator, so results are
    reproducible after ``random.seed(...)``.

    Args:
        df: Source log. Must contain a ``targetApp`` column, and at least one
            row when ``num_rows`` is positive (``random.choice`` raises
            ``IndexError`` on an empty column).
        num_rows: Number of rows to generate.

    Returns:
        A single DataFrame with one row per generated event. It never contains
        a ``caseid`` column; with ``num_rows == 0`` it has no columns at all.
    """
    # Materialise each column once; converting inside the row loop would redo
    # the same work for every generated row.
    column_pools = {col: df[col].tolist() for col in df.columns if col != "caseid"}

    # Predefined lists for random data
    random_actions = [
        "copyCell", "paste", "editField", "clickButton", "clickLink", "selectWorksheet",
        "copyRange", "form_submit", "createNewTab"
    ]

    random_urls = [
        "https://example.com", "https://example.org", "https://example.net", "", " ",
        "https://randomsite.com", "https://testsite.com", "https://anotherurl.com",
        "https://sap.example.com", "https://service-now.example.com", "https://salesforce.example.com",
        "https://jira.example.com", "https://confluence.example.com", "https://microsoft.com",
        "https://office365.example.com", "https://slack.com", "https://zoom.us",
        "https://google.com", "https://github.com", "https://linkedin.com",
        "https://workday.example.com", "https://oracle.com", "https://adobe.com"
    ]
    random_content = [
        "Lorem ipsum dolor sit amet.", "This is a random content example.",
        "Random text for data generation.", "", " ", "Sample content for testing.",
        "Another random string.", "Test content for the application.",
        "Placeholder text for demonstration.", "Randomly generated content."
    ]
    random_file_names = [
        "report_final.xlsx", "data_analysis.csv", "project_notes.pdf", "budget_2023.xlsx",
        "presentation.csv", "meeting_minutes.xlsx", "research_paper.xml"
    ]
    random_cell_names = [
        "A1", "B2", "C3", "D4", "E5", "F6", "G7", "H8", "I9", "J10", "K11", "L12",
        "M13", "N14", "O15", "P16", "Q17", "R18", "S19", "T20", "U21", "V22", "W23",
        "X24", "Y25", "Z26", "AA27", "AB28", "AC29", "AD30", "AE31", "AF32", "AG33",
    ]
    random_html_tags = [
        "div", "span", "p", "a", "img", "h5", "h6", "ul", "ol", "li",
        "table", "form", "input", "button", "select", "option", "textarea", "label"
    ]
    random_sheet_names = [
        "Sheet1", "Sheet2", "Sheet3", "Sheet4", "Sheet5", "Sheet6", "Sheet7"
    ]

    rows = []

    for _ in range(num_rows):
        # One independent draw per column, in column order.
        random_row = {col: random.choice(pool) for col, pool in column_pools.items()}

        # Always overwritten, regardless of the sampled application.
        random_row["eventType"] = random.choice(random_actions)
        random_row["content"] = random.choice(random_content)

        if random_row["targetApp"] == "Chrome":
            # Browser event: no spreadsheet context, random page and element.
            random_row["target.sheetName"] = ""
            random_row["target.workbookName"] = ""
            random_row["url"] = random.choice(random_urls)
            random_row["target.id"] = random.choice(random_html_tags)

        elif random_row["targetApp"] == "Excel":
            # Spreadsheet event: random cell/workbook/sheet, no browser context.
            random_row["target.id"] = random.choice(random_cell_names)
            random_row["target.workbookName"] = random.choice(random_file_names)
            random_row["target.sheetName"] = random.choice(random_sheet_names)
            random_row["target.tagName"] = ""
            random_row["url"] = ""
            random_row["target.type"] = ""
            random_row["target.href"] = ""

        rows.append(random_row)

    return pd.DataFrame(rows)
                                        

### Ground Truth Calculation for unextended Log

Because some routines are incomplete, the last SR routine and the first RT routine do not match in the PLUS log. These entries have to be adjusted manually, as automating this is not feasible.
The same applies to the SR_RT parallel log: the last ground-truth values have to be adjusted to match the original data.

In [None]:
# For every input log: derive case boundaries from marker rows, write a
# ground-truth table (caseid, start_index, length, motif) and add a "caseid"
# column to the log itself.
for log_filename in list_of_logs:
    df = pd.read_csv(file_path + log_filename, encoding=text_encoding_method, sep=seperator)

    # A row is treated as a case boundary when it is on the Zoho "thank you"
    # page, or is a clickLink on the JotForm submit URL.
    cond = (df["url"] == "https://forms.zoho.com/universityofmelbourne/form/NewRecord/thankyou") | ((df["url"] == "https://submit.jotform.com/submit/200477494954062/") & (df["eventType"] == "clickLink"))
    insert_positions = df.index[cond].tolist()

    og_block_nr = 1
    current = 0
    current_out_index = 0
    # Collect plain dicts and build the frame once afterwards; concatenating
    # onto an initially empty DataFrame on every iteration is quadratic and
    # relies on deprecated empty-frame concat behaviour.
    gt_rows = []
    for pos in insert_positions:
        length = pos - current
        # Heuristic: blocks longer than 40 events are labelled RT, others SR.
        motif = "RT" if length > 40 else "SR"
        gt_rows.append({
            "caseid": og_block_nr,
            "start_index": current_out_index,
            "length": length,
            "motif": motif
        })
        og_block_nr += 1
        # NOTE(review): only the first case advances by length + 1, so every
        # later start_index sits one row past its boundary row. Looks like a
        # deliberate alignment of the first case - confirm.
        if current_out_index == 0:
            current_out_index += length + 1
        else:
            current_out_index += length
        current = pos
    gt_df = pd.DataFrame(gt_rows, columns=["caseid", "start_index", "length", "motif"])

    # Adding Case IDs >> Important >> Some manual changes will be necessary after this step to ensure correctness, because the rule under cond does not guarantee perfect splits.
    # Each case overwrites everything from its start to the end of the log, so
    # after the loop a row carries the id of the last case starting at or
    # before it; rows before the first start (or all rows, if no boundary was
    # found) keep 0.
    df["caseid"] = 0
    for gt in gt_rows:
        df.loc[gt["start_index"]:, "caseid"] = gt["caseid"]

    # The annotated log overwrites the file it was read from.
    df.to_csv(file_path + log_filename, index=False, sep=seperator, encoding=text_encoding_method)
    gt_df.to_csv(file_path + "202511_ground_truth_2_" + log_filename, index=False, sep=seperator, encoding=text_encoding_method)

### Creation of extended Log and Calculation of Ground Truth

In [51]:
# For every input log: split it at the marker rows, append a batch of random
# noise events after each block, and write both the extended log and the
# matching ground-truth table (positions refer to the extended log).
for log_filename in list_of_logs:
    df = pd.read_csv(file_path + log_filename, encoding=text_encoding_method, sep=seperator)
    # Default id; rows after the last boundary (the tail) keep this value.
    df["caseid"] = -1

    # A row is treated as a case boundary when it is on the Zoho "thank you"
    # page, or is a clickLink on the JotForm submit URL.
    cond = (df["url"] == "https://forms.zoho.com/universityofmelbourne/form/NewRecord/thankyou") | ((df["url"] == "https://submit.jotform.com/submit/200477494954062/") & (df["eventType"] == "clickLink"))
    insert_positions = df.index[cond].tolist()

    blocks = []
    og_block_nr = 1
    current = 0
    current_out_index = 0
    num_of_inserts = 50  # noise events appended after every block
    # Collect plain dicts and build the frame once afterwards; concatenating
    # onto an initially empty DataFrame on every iteration is quadratic and
    # relies on deprecated empty-frame concat behaviour.
    gt_rows = []
    for pos in insert_positions:
        # Original events of this case: rows [current, pos) of the input log.
        block = df.iloc[current:pos].copy()
        block["caseid"] = og_block_nr
        blocks.append(block)

        # Noise rows carry no "caseid" column, so they end up as NaN in the
        # concatenated output. The helper returns a fresh frame; no copy needed.
        blocks.append(generate_random_rows(df, num_of_inserts))

        length = pos - current
        # Heuristic: blocks longer than 40 events are labelled RT, others SR.
        motif = "RT" if length > 40 else "SR"

        gt_rows.append({
            "caseid": og_block_nr,
            "start_index": current_out_index,
            "length": length,
            "motif": motif
        })

        og_block_nr += 1
        # The next block starts after this block and its noise batch.
        current_out_index += length + num_of_inserts
        current = pos
    gt_df = pd.DataFrame(gt_rows, columns=["caseid", "start_index", "length", "motif"])

    blocks.append(df.iloc[current:])  # tail

    out = pd.concat(blocks, ignore_index=True)
    out.to_csv(file_path + "202511_extended_" + log_filename, index=False, sep=seperator, encoding=text_encoding_method)
    gt_df.to_csv(file_path + "202511_ground_truth_extended_" + log_filename, index=False, sep=seperator, encoding=text_encoding_method)

In [48]:
# Display the ground-truth table currently bound to `gt_df`; the loops above
# rebind it per log file, so this is the table of the last file processed.
gt_df

Unnamed: 0,caseid,start_index,length,motif
0,1,0,28,SR
1,2,78,61,RT
2,3,189,33,SR
3,4,272,60,RT
4,5,382,31,SR
...,...,...,...,...
95,96,9149,60,RT
96,97,9259,33,SR
97,98,9342,60,RT
98,99,9452,33,SR


In [50]:
# Display rows at positions 75-84 of the extended log `out` to spot-check the
# transition from a batch of noise rows into the following case.
out.iloc[75:85]

Unnamed: 0,timeStamp,userID,targetApp,eventType,url,content,target.workbookName,target.sheetName,target.id,target.class,...,target.type,target.name,target.value,target.innerText,target.checked,target.href,target.option,target.title,target.innerHTML,caseid
75,2020-02-25T02:39:26.072Z,vleno,Excel,editField,,Randomly generated content.,data_analysis.csv,Sheet1,F6,,...,,,82236031,,,,,,,
76,2019-10-21T00:39:10.985Z,vleno,Chrome,copyRange,https://service-now.example.com,Placeholder text for demonstration.,,,div,,...,text,,dwu@gmail.com,740,,,,,,
77,2019-10-21T01:02:07.450Z,vleno,Chrome,form_submit,https://office365.example.com,Test content for the application.,,,span,,...,text,q3_name[first],01-2247-00-5694-00010-B04-20-01,\n Next\n,,,,,,
78,2019-10-21T00:11:41.741Z,vleno,Chrome,clickLink,https://forms.zoho.com/universityofmelbourne/f...,,,,,,...,,,,Add another response.,,/universityofmelbourne/form/NewRecord,,,\n<em class=liveAddIcon flLeft></em>\n<b>Add ...,2.0
79,2019-10-21T00:15:15.277Z,vleno,Chrome,createNewTab,,,,,218,,...,,,,,,,,,,2.0
80,2019-10-21T00:12:04.330Z,vleno,Excel,copyCell,,John Doe,reimbursement.xlsx,Student details,A2,,...,,,John Doe,,,,,,,2.0
81,2019-10-21T00:12:09.464Z,vleno,Chrome,paste,https://form.jotform.com/200477494954062,John Doe,,,first_3,,...,text,q3_name[first],,,,,,,,2.0
82,2019-10-21T00:12:10.104Z,vleno,Chrome,editField,https://form.jotform.com/200477494954062,,,,first_3,,...,text,q3_name[first],John,,,,,,,2.0
83,2019-10-21T00:12:16.000Z,vleno,Chrome,paste,https://form.jotform.com/200477494954062,John Doe,,,last_3,,...,text,q3_name[last],,,,,,,,2.0
84,2019-10-21T00:12:19.240Z,vleno,Chrome,editField,https://form.jotform.com/200477494954062,,,,last_3,,...,text,q3_name[last],Doe,,,,,,,2.0
