# Notebook to generate an extended UI Log

Based on the Student Record (SR) and Reimbursement (RT) logs from Leno et al., this notebook generates an extended version of the log.

The logs are in the folder called "Leno".

Properties of the original logs:
1. SR_RT_joint: Contains all Student Record traces followed by all Reimbursement traces.
2. SR_RT_parallel: Contains all Student Record traces alternating with all Reimbursement traces.

The original Leno data was obtained from https://figshare.com/articles/dataset/UI_logs/12543587

Properties of the Extended Logs:
1. Extended_SR_RT_joint: Between all traces there are X randomly generated events. X can be set in this notebook.
2. Extended_SR_RT_parallel: Same as for Extended_SR_RT_joint.

In [2]:
import random

from datetime import datetime

import pandas as pd
import numpy as np

# Folder holding the original Leno et al. logs; the extended logs are written to the same folder
file_path = 'Leno/'
# File names of the two original logs inside `file_path`
srrt_plus_filename = "SR_RT_joint.csv"
srrt_parallel_filename = "SR_RT_parallel.csv"

# Text encoding used when reading the original CSV files
text_encoding_method = "utf-8"
# Column delimiter for reading and writing the CSV files.
# NOTE: the identifier is misspelled ("seperator") but is referenced further down, so the name is kept.
seperator = ";"

#### Execution of Log Generation for Discovery

Generating two extended logs
1. Adding a case id for all existing cases
2. Adding 50 random actions between all cases to simulate long time recording

In [3]:
def add_caseid_column(df: pd.DataFrame, random_insert: bool = True, num_rows: int = 50) -> pd.DataFrame:
    """
    Adds a 'caseid' column as the first column of the DataFrame.

    Case IDs start at 1. A row closes the current case when its "url" equals the
    form's thank-you page or its "eventType" is "createNewTab". The closing row
    still belongs to the case it closes; the row after it starts the next case.
    An existing 'caseid' column is discarded and recomputed. The DataFrame passed
    in by the caller is never modified.

    Args:
        df: Event log. The "url" and "eventType" columns are both optional; a
            missing column simply never closes a case.
        random_insert: If True, the result is passed through `insert_random_rows`
            so that random rows are added at every case transition.
        num_rows: Number of random rows per transition (only used when
            `random_insert` is True).

    Returns:
        A new DataFrame with the 'caseid' column at position 0 (plus the random
        rows when `random_insert` is True).
    """
    target_url: str = "https://forms.zoho.com/universityofmelbourne/form/NewRecord/thankyou"
    target_event: str = "createNewTab"

    # Always work on a new object so the caller's DataFrame is left untouched
    if "caseid" in df.columns:
        df = df.drop(columns=["caseid"])
    else:
        df = df.copy()

    # Mark every row that ends a case
    closes_case = pd.Series(False, index=df.index, dtype=bool)
    if "url" in df.columns:
        closes_case = closes_case | (df["url"] == target_url)
    if "eventType" in df.columns:
        closes_case = closes_case | (df["eventType"] == target_event)

    # caseid = 1 + number of closing rows strictly before the current row
    closing_flags = closes_case.astype("int64")
    caseid = closing_flags.cumsum() - closing_flags + 1

    # Insert positionally (as an array) so index labels play no role
    df.insert(0, "caseid", caseid.to_numpy())

    if random_insert:
        return insert_random_rows(df, num_rows=num_rows)
    return df

def insert_random_rows(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:
    """
    Inserts "num_rows" random rows at each caseid transition point.

    A transition is any row whose "caseid" differs from that of the row directly
    before it. The random rows come from `generate_random_rows` and are placed
    between the two cases.

    Args:
        df: Event log containing a "caseid" column.
        num_rows: Number of random rows to insert at every transition.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. An empty input is returned
        as an empty copy.
    """
    # Nothing to split and no transitions possible
    if df.empty:
        return df.reset_index(drop=True)

    # Positions at which a new case starts (the first row is never a transition)
    caseids = df["caseid"].tolist()
    transition_positions = [
        position
        for position, (previous, current) in enumerate(zip(caseids, caseids[1:]), start=1)
        if current != previous
    ]

    # Keep each case as one contiguous slice instead of one DataFrame per row;
    # this is faster and keeps the column dtypes of the original log.
    new_df_parts: list[pd.DataFrame] = []
    segment_start = 0
    for position in transition_positions:
        new_df_parts.append(df.iloc[segment_start:position])
        new_df_parts.extend(generate_random_rows(df, num_rows))
        segment_start = position
    new_df_parts.append(df.iloc[segment_start:])

    return pd.concat(new_df_parts, ignore_index=True)

def generate_random_rows(df: pd.DataFrame, num_rows: int) -> list[pd.DataFrame]:
    """
    Generates "num_rows" single-row DataFrames of random noise events.

    For every column except "caseid", a value is drawn at random from the values
    present in that column of `df`. The "eventType", "url", "content" and
    "target.workbookName" entries are then replaced with values drawn from the
    predefined lists below. The generated rows carry no "caseid" column.

    Args:
        df: Event log whose column values are sampled.
        num_rows: Number of random rows to generate.

    Returns:
        A list of `num_rows` one-row DataFrames. A column without any values to
        sample from (empty `df`) is filled with None.
    """
    # Predefined lists for random data
    random_actions = [
        "doubleClick", "drag", "closeWindow", "scroll", "click", "hover", "resize", "select",
        "submit", "download", "upload", "edit", "view", "delete", "create", "open", "logIn",
        "logOut", "search", "refresh", "approve", "reject"
    ]
    random_urls = [
        "https://example.com", "https://example.org", "https://example.net", "", " ",
        "https://randomsite.com", "https://testsite.com", "https://anotherurl.com",
        "https://sap.example.com", "https://service-now.example.com", "https://salesforce.example.com",
        "https://jira.example.com", "https://confluence.example.com", "https://microsoft.com",
        "https://office365.example.com", "https://slack.com", "https://zoom.us",
        "https://google.com", "https://github.com", "https://linkedin.com",
        "https://workday.example.com", "https://oracle.com", "https://adobe.com"
    ]
    random_content = [
        "Lorem ipsum dolor sit amet.", "This is a random content example.",
        "Random text for data generation.", "", " "
    ]
    random_file_names = [
        "report_final.xlsx", "data_analysis.csv", "project_notes.pdf", "", " "
    ]

    # Convert every column to a list once, instead of once per generated cell
    column_values: dict = {col: df[col].tolist() for col in df.columns if col != "caseid"}

    shuffled_rows: list[pd.DataFrame] = []
    for _ in range(num_rows):
        # Draw one existing value per column. Columns overwritten below are still
        # drawn so that the sequence of random numbers consumed stays the same.
        random_row: dict = {
            col: random.choice(values) if values else None
            for col, values in column_values.items()
        }

        # Replace specific columns with values from the predefined lists
        random_row["eventType"] = random.choice(random_actions)  # Random action
        random_row["url"] = random.choice(random_urls)  # Random URL
        random_row["content"] = random.choice(random_content)  # Random personal content
        random_row["target.workbookName"] = random.choice(random_file_names)  # Random file name

        shuffled_rows.append(pd.DataFrame([random_row]))

    return shuffled_rows


# Reset the joint log to an empty frame before loading
srrt_plus_log = pd.DataFrame()

# Load both original logs from Leno et al.
srrt_plus_log = pd.read_csv(
    f"{file_path}{srrt_plus_filename}",
    sep=seperator,
    encoding=text_encoding_method,
)
srrt_parallel_log = pd.read_csv(
    f"{file_path}{srrt_parallel_filename}",
    sep=seperator,
    encoding=text_encoding_method,
)

# Assign case ids and put 50 random rows at every case transition
srrt_plus_log = add_caseid_column(srrt_plus_log, num_rows=50, random_insert=True)
srrt_parallel_log = add_caseid_column(srrt_parallel_log, num_rows=50, random_insert=True)

# Write the extended logs next to the originals, prefixed with the current time (yymmdd_HHMM)
timestamp = datetime.now().strftime("%y%m%d_%H%M")
full_path_srrt_plus = f"{file_path}{timestamp}_extended_{srrt_plus_filename}"
full_path_srrt_parallel = f"{file_path}{timestamp}_extended_{srrt_parallel_filename}"
srrt_plus_log.to_csv(full_path_srrt_plus, sep=seperator, index=False)
srrt_parallel_log.to_csv(full_path_srrt_parallel, sep=seperator, index=False)