# Notebook to generate an extended UI Log

Based on the Student Record (SR) and Reimbursement (RT) logs from Leno et al., this notebook generates an extended version of the log.

The logs are in the folder called "Leno".

Properties of the original logs:
1. SR_RT_joint: Contains all Student Record traces, followed by all Reimbursement traces.
2. SR_RT_parallel: Contains the Student Record traces alternating with the Reimbursement traces.

The original Leno data can be downloaded from https://figshare.com/articles/dataset/UI_logs/12543587

Properties of the Extended Logs:
1. Extended_SR_RT_joint: Between all traces there are X randomly generated events. X can be set in this notebook.
2. Extended_SR_RT_parallel: Same as for Extended_SR_RT_joint.

In [1]:
import random

from datetime import datetime

import pandas as pd
import numpy as np

# Folder (relative to the notebook) that holds the original Leno et al. logs.
file_path = "Leno/"

# File names of the two source logs inside `file_path`.
srrt_plus_filename = 'SR_RT_joint.csv'
srrt_parallel_filename = 'SR_RT_parallel.csv'

# CSV dialect used for reading the logs and writing the extended copies.
text_encoding_method = 'utf-8'
seperator = ';'

#### Execution of Log Generation for Discovery

Generating two extended logs
1. Adding a case id for all existing cases
2. Adding random actions between all cases (`num_rows=25` in the calls below) to simulate a long-running recording

In [None]:
def generate_random_rows(df: pd.DataFrame, num_rows: int) -> list[pd.DataFrame]:
    """
    Generate ``num_rows`` synthetic noise events sampled from an existing log.

    Every column of ``df`` except ``"caseid"`` is sampled independently, so a
    generated row mixes values taken from different original rows. Afterwards
    ``eventType`` and ``content`` are always replaced with values from the
    predefined pools below, and the ``url`` / ``target.*`` fields are
    overwritten depending on the sampled ``targetApp`` ("Chrome" or "Excel").
    Rows with any other ``targetApp`` keep their sampled values.

    Args:
        df: Source log to sample column values from.
        num_rows: Number of random rows to generate.

    Returns:
        A list with ``num_rows`` single-row DataFrames (none contains a
        ``"caseid"`` column).

    Raises:
        KeyError: If ``df`` has no ``"targetApp"`` column and ``num_rows`` > 0.
        IndexError: If ``df`` has columns but no rows and ``num_rows`` > 0.
    """
    # Predefined pools for the randomly filled columns
    random_actions = [
        "copyCell", "paste", "editField", "clickButton", "clickLink", "selectWorksheet",
        "copyRange", "form_submit", "createNewTab"
    ]
    random_urls = [
        "https://example.com", "https://example.org", "https://example.net", "", " ", 
        "https://randomsite.com", "https://testsite.com", "https://anotherurl.com", 
        "https://sap.example.com", "https://service-now.example.com", "https://salesforce.example.com", 
        "https://jira.example.com", "https://confluence.example.com", "https://microsoft.com", 
        "https://office365.example.com", "https://slack.com", "https://zoom.us", 
        "https://google.com", "https://github.com", "https://linkedin.com", 
        "https://workday.example.com", "https://oracle.com", "https://adobe.com"
    ]
    random_content = [
        "Lorem ipsum dolor sit amet.", "This is a random content example.", 
        "Random text for data generation.", "", " ", "Sample content for testing.", 
        "Another random string.", "Test content for the application.", 
        "Placeholder text for demonstration.", "Randomly generated content."
    ]
    random_file_names = [
        "report_final.xlsx", "data_analysis.csv", "project_notes.pdf", "budget_2023.xlsx",
        "presentation.csv", "meeting_minutes.xlsx", "research_paper.xml"
    ]
    random_cell_names = [
        "A1", "B2", "C3", "D4", "E5", "F6", "G7", "H8", "I9", "J10", "K11", "L12",
        "M13", "N14", "O15", "P16", "Q17", "R18", "S19", "T20", "U21", "V22", "W23",
        "X24", "Y25", "Z26", "AA27", "AB28", "AC29", "AD30", "AE31", "AF32", "AG33",
    ]
    random_html_tags = [
        "div", "span", "p", "a", "img", "h5", "h6", "ul", "ol", "li",
        "table", "form", "input", "button", "select", "option", "textarea", "label"
    ]
    random_sheet_names = [
        "Sheet1", "Sheet2", "Sheet3", "Sheet4", "Sheet5", "Sheet6", "Sheet7"
    ]

    columns_to_shuffle = [col for col in df.columns if col != "caseid"]
    # Convert each column to a list once; doing it inside the loop would copy
    # the whole column again for every generated row.
    column_values = {col: df[col].tolist() for col in columns_to_shuffle}

    shuffled_rows: list[pd.DataFrame] = []
    for _ in range(num_rows):
        # Sample every non-caseid column independently from the existing values
        random_row: dict = {
            col: random.choice(column_values[col]) for col in columns_to_shuffle
        }

        # Always override the action and the content with pool values
        random_row["eventType"] = random.choice(random_actions)  # Random action
        random_row["content"] = random.choice(random_content)  # Random content

        target_app = random_row["targetApp"]
        if target_app == "Chrome":
            random_row["target.sheetName"] = ""  # Empty for non-Excel apps
            random_row["target.workbookName"] = ""  # Empty for non-Excel apps
            random_row["url"] = random.choice(random_urls)  # Random URL
            random_row["target.id"] = random.choice(random_html_tags)  # Random HTML tag
        elif target_app == "Excel":
            random_row["target.id"] = random.choice(random_cell_names)  # Random cell name
            random_row["target.workbookName"] = random.choice(random_file_names)  # Random file name
            random_row["target.sheetName"] = random.choice(random_sheet_names)  # Random sheet name
            # Browser-only fields are blanked for Excel events
            random_row["target.tagName"] = ""
            random_row["url"] = ""
            random_row["target.type"] = ""
            random_row["target.href"] = ""

        shuffled_rows.append(pd.DataFrame([random_row]))

    return shuffled_rows
                                            

# Value passed as `num_rows` to add_caseid_column for both logs; change it here
# to control how much random noise is generated.
NUM_RANDOM_ROWS = 25

# Read the original CSV logs from Leno et al.
srrt_plus_log = pd.read_csv(file_path + srrt_plus_filename, encoding=text_encoding_method, sep=seperator)
print("Lenght of SRRT+ Log: ", len(srrt_plus_log))
srrt_parallel_log = pd.read_csv(file_path + srrt_parallel_filename, encoding=text_encoding_method, sep=seperator)
print("Length of SRRT|| Log: ", len(srrt_parallel_log))

# Add case ids for discovery and random noise.
# NOTE(review): add_caseid_column is not defined in this part of the notebook;
# it has to be defined before this code is executed.
srrt_plus_log = add_caseid_column(srrt_plus_log, random_insert=True, num_rows=NUM_RANDOM_ROWS)
srrt_parallel_log = add_caseid_column(srrt_parallel_log, random_insert=True, num_rows=NUM_RANDOM_ROWS)

# Store the extended logs next to the originals. The file names are prefixed
# with a minute-resolution timestamp, so runs started in different minutes
# write to different files.
timestamp = datetime.now().strftime("%y%m%d_%H%M")
full_path_srrt_plus = file_path + f"{timestamp}_extended_" + srrt_plus_filename
full_path_srrt_parallel = file_path + f"{timestamp}_extended_" + srrt_parallel_filename
srrt_plus_log.to_csv(full_path_srrt_plus, index=False, sep=seperator)
srrt_parallel_log.to_csv(full_path_srrt_parallel, index=False, sep=seperator)

# Report the log sizes again, now including the inserted rows
print("Lenght of SRRT+ Log: ", len(srrt_plus_log))
print("Length of SRRT|| Log: ", len(srrt_parallel_log))

Lenght of SRRT+ Log:  4644
Length of SRRT|| Log:  4644


ValueError: If using all scalar values, you must pass an index