# Motif Discovery in User Interaction Logs 

### Library Import

In [1]:
import pandas as pd
import numpy as np
import sys

import os

import time # just for dev purpose
from IPython.display import display # Just for displaying DF nicely

from itertools import product

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from matplotlib.patches import Rectangle
import datetime as dt

from util.util import *
from util.const import conceptNames

### Data Gathering
Read all files in the folder "validation" and the corresponding validation data containing the information about length and distribution of the motifs.

In [2]:
folder_path = "logs/smartRPA/validation/"
folder_path = "logs/smartRPA/percentageComparison/"

UILogValidation_filename = "validation_data.csv"
variableLenValidation_filename = "var_len_validation_data.csv"
percentagData_filename = "validationDataPercentage.csv"

varLenUILogs = []
UILogs = []
percentageLogs = []

# Getting the relevant files from the folder and sorting them into different lists for processing
for file in os.listdir(folder_path):
    if file.startswith("Log"):
        UILogs.append(file)
    elif file.startswith("Var"):
        varLenUILogs.append(file)
    elif file.startswith("LenLog"):
        percentageLogs.append(file)

# Read the validation data into two dataframes for processing
try:
    UILogValidationDF = pd.read_csv(folder_path + UILogValidation_filename)
except FileNotFoundError as e:
    print(f"Could not read {UILogValidation_filename} from the folder.\n{e}")

# Check if varliable motif data is present by checking if the var_len_validation_data.csv file exists
try:
    variableLenValidationDF = pd.read_csv(folder_path + variableLenValidation_filename)
    varLenDataAvailable = True
except FileNotFoundError as e:
    print(f"Could not read {variableLenValidation_filename} from the folder.\n{e}")
    varLenDataAvailable = False

# Check if data created for percentage based comparison is available
try:
    percentageValData = pd.read_csv(folder_path + percentagData_filename)
    percentageValAvailable = True
except FileNotFoundError as e:
    print(f"Could not read {percentagData_filename} from the folder.\n{e}")
    percentageValAvailable = False

experimentColumns = ["experimentID","uiLogName","variationPercentage","percentageMotifsOverLog","motifLength","windowSize",
                     "windowSizeMatch","motifsToBeDiscovered","motifsDiscovered","numberOfOccurrancesToBeDiscovered",
                     "OccurancesDiscovered","DiscoveryPercentage","alignmentAccuracy","executionTime",
                     "motifSpots","discoveredSpots","DiscoveryLoops"]
experimentResults = pd.DataFrame(columns=experimentColumns)

Could not read validation_data.csv from the folder.
[Errno 2] No such file or directory: 'logs/smartRPA/percentageComparison/validation_data.csv'
Could not read var_len_validation_data.csv from the folder.
[Errno 2] No such file or directory: 'logs/smartRPA/percentageComparison/var_len_validation_data.csv'


### Experiment Setup

1. Create a dataframe to store the experiment results
2. Conduct the experiment for all fixed length files
3. If variable length motifs are created, conduct the experiment for all variable length files

Discover motifs for fixed length motif UI Logs:

In [3]:
for i, log in enumerate(UILogs):
    file = pd.read_csv(folder_path + log)
    insertSpots = UILogValidationDF.loc[UILogValidationDF['Filename'] == log]["Index"]
    inserted_motif_spots = extract_numbers(insertSpots[insertSpots.index[0]])

    window_sizes = [10,25,30,50]
    # Encode the UI log using the function in util.py
    uiLog = encoding_UiLog(file)
    print(f"UI Log {log} is encoded")
    for j, size in enumerate(window_sizes):
        start_time = time.time()

        # Split the string by underscores
        # What does each position in the parts mean: 0: Log Type, 1: Sampling, 2: No. of Motifs, 3: No. of Occurrances, 4: Log Length, 5: Shuffle Percentage, 6: Reduction Percentage
        parts = log.split('_')
        experimentId = str(i)+"."+str(j)
        new_row = {'experimentID': experimentId, 'uiLogName': log, "variationPercentage":  parts[5], "motifsToBeDiscovered": parts[2], "numberOfOccurrancesToBeDiscovered": parts[3],
                    "motifSpots": inserted_motif_spots, "windowSize": size}
        
        inserted_motif_spots = extract_numbers(insertSpots[insertSpots.index[0]])
        #Looping
        k=0
        insert_overlap_all = []
        # Compare inserted values and discovered spots   
        
        while(k<5):
            # Discovery motifs in the dataset
            tm_matrix, event_series = discover_motifs(uiLog, size)

            motif_idx_tm = np.argsort(tm_matrix[:, 0])[0]
            nearest_neighbor_idx_tm = tm_matrix[motif_idx_tm, 1]
            top_motifs = stumpy.motifs(T=event_series, P=tm_matrix[:,0], min_neighbors=1, max_matches=12)
            # If not motifs are discovered in the first round, break and add this as information
            if len(top_motifs[1][0]) == 0 and k == 0:
                insert_overlap_all = "No motifs discovered in the data"
                break
        
            insert_overlap, motif_overlap = compare_sets(set(inserted_motif_spots), set(top_motifs[1][0]), size)
            insert_overlap_all = insert_overlap_all + insert_overlap

            # Get indices for descending order by size (number of elements in each sub-array)

            sorted_indices = top_motifs[1][0].argsort(axis=0)[::-1]
            top_motifs_list = top_motifs[1][0][sorted_indices]

            # Reduce all remaining indexes by the window size, Reduces only the indexes that are after (higher) then the identified motif, because lower indexes are not moved
            # Does simulate a human checking the data and identifying equal processes
            inserted_motif_spots = [item for item in inserted_motif_spots if item not in set(insert_overlap)]
            for motif in motif_overlap:
                uiLog = pd.concat([uiLog.iloc[:motif],uiLog.iloc[motif+size:]],ignore_index=True)
                inserted_motif_spots = [(val - size) if val > motif else val for val in inserted_motif_spots]
                # Break Criteria: We have done the discovery 5 times (k++) or we have found all motifs (set k = 5)
                if len(inserted_motif_spots) == 0:
                    k = 5
                    break
            
            k+=1
            new_row.update({"DiscoveryLoops": k})
        
        end_time = time.time()
        new_row.update({"discoveredSpots": insert_overlap_all,"OccurancesDiscovered": len(insert_overlap_all), "executionTime": end_time - start_time, "windowSize": size})

        print(f"Discovery for experiment {experimentId} with file {log} and window size {size} finished.")
        # Append the new row to the DataFrame
        experimentResults = experimentResults._append(new_row, ignore_index=True)

Discover motifs for variable length motifs in UI Logs:

In [4]:
if varLenDataAvailable:
    for i, log in enumerate(varLenUILogs):
        file = pd.read_csv(folder_path + log)
        insertSpots = variableLenValidationDF.loc[variableLenValidationDF['Filename'] == log]["Index"]
        inserted_motif_spots = extract_numbers(insertSpots[insertSpots.index[0]])

        window_sizes = [10,25,30,50]
        # Encode the UI log using the function in util.py
        uiLog = encoding_UiLog(file)
        print(f"UI Log {log} is encoded")
        for j, size in enumerate(window_sizes):
            start_time = time.time()

            # Split the string by underscores
            # What does each position in the parts mean: 0: Log Type, 1: Sampling, 2: No. of Motifs, 3: No. of Occurrances, 4: Log Length, 5: Shuffle Percentage, 6: Reduction Percentage
            parts = log.split('_')
            experimentId = str(i)+"."+str(j)
            new_row = {'experimentID': experimentId, 'uiLogName': log, "variationPercentage":  parts[5], "motifsToBeDiscovered": parts[2], "numberOfOccurrancesToBeDiscovered": parts[3],
                        "motifSpots": inserted_motif_spots, "windowSize": size}
            
            inserted_motif_spots = extract_numbers(insertSpots[insertSpots.index[0]])
            #Looping
            k=0
            insert_overlap_all = []
            # Compare inserted values and discovered spots   
            
            while(k<5):
                # Discovery motifs in the dataset
                tm_matrix, event_series = discover_motifs(uiLog, size)

                motif_idx_tm = np.argsort(tm_matrix[:, 0])[0]
                nearest_neighbor_idx_tm = tm_matrix[motif_idx_tm, 1]
                top_motifs = stumpy.motifs(T=event_series, P=tm_matrix[:,0], min_neighbors=1, max_matches=12)
                # If not motifs are discovered in the first round, break and add this as information
                if len(top_motifs[1][0]) == 0 and k == 0:
                    insert_overlap_all = "No motifs discovered in the data"
                    break
            
                insert_overlap, motif_overlap = compare_sets(set(inserted_motif_spots), set(top_motifs[1][0]), size)
                insert_overlap_all = insert_overlap_all + insert_overlap

                # Get indices for descending order by size (number of elements in each sub-array)

                sorted_indices = top_motifs[1][0].argsort(axis=0)[::-1]
                top_motifs_list = top_motifs[1][0][sorted_indices]

                # Reduce all remaining indexes by the window size, Reduces only the indexes that are after (higher) then the identified motif, because lower indexes are not moved
                # Does simulate a human checking the data and identifying equal processes
                inserted_motif_spots = [item for item in inserted_motif_spots if item not in set(insert_overlap)]
                for motif in motif_overlap:
                    uiLog = pd.concat([uiLog.iloc[:motif],uiLog.iloc[motif+size:]],ignore_index=True)
                    inserted_motif_spots = [(val - size) if val > motif else val for val in inserted_motif_spots]
                    # Break Criteria: We have done the discovery 5 times (k++) or we have found all motifs (set k = 5)
                    if len(inserted_motif_spots) == 0:
                        k = 5
                        break
                
                k+=1
                new_row.update({"DiscoveryLoops": k})
            
            end_time = time.time()
            new_row.update({"discoveredSpots": insert_overlap_all,"OccurancesDiscovered": len(insert_overlap_all), 
                            "executionTime": end_time - start_time, "windowSize": size})

            print(f"Discovery for experiment {experimentId} with file {log} and window size {size} finished.")
            # Append the new row to the DataFrame
            experimentResults = experimentResults._append(new_row, ignore_index=True)

### Percentage Based Discovery

In [94]:
window_sizes = [5,10,15,25,30,35]
experimentResults = pd.DataFrame(columns=experimentColumns)

for i, log in enumerate(percentageLogs):
    file = pd.read_csv(folder_path + log)
    # Getting the row with the uiLog name from the validation data
    comparisonVariables = percentageValData.loc[percentageValData['uiLogName'] == log]
    # Where the motifs were initially added
    insertSpots = comparisonVariables["motifSpots"]

    # Encode the UI log
    uiLog = encoding_UiLog(file)
    print(f"UI Log {log} is encoded")

    # Where are the motifs actually in the dataframe
    inserted_motif_spots = extract_numbers(insertSpots[insertSpots.index[0]])

    for j, size in enumerate(window_sizes):
        start_time = time.time()
        experimentId = str(i)+"."+str(j)
        new_row = {'experimentID': experimentId, 'uiLogName': log, "variationPercentage": comparisonVariables["variationPercentage"][comparisonVariables.index[0]], 
                   "motifLength": comparisonVariables["motifLength"][comparisonVariables.index[0]], 
                   "windowSize": size, "windowSizeMatch": comparisonVariables["motifLength"][comparisonVariables.index[0]]-size,
                   "percentageMotifsOverLog": comparisonVariables["percentageMotifsOverLog"][comparisonVariables.index[0]],
                   "motifsToBeDiscovered": comparisonVariables["motifsToBeDiscovered"][comparisonVariables.index[0]], 
                   "motifsDiscovered": 1, "numberOfOccurrancesToBeDiscovered": comparisonVariables["numberOfOccurrancesToBeDiscovered"][comparisonVariables.index[0]],
                    "motifSpots": inserted_motif_spots}
        

        # Discovery motifs in the dataset
        tm_matrix = None
        event_series = None
        tm_matrix, event_series = discover_motifs(uiLog, size)

        matches_match = True # Do While Construct with condition checked after each run
        maximalNoOfMatches = 10 # Initiating by looking for 10 occurances
        while matches_match: # As long as we have found the same amount of motifs as expected we increase the max match
            top_motifs = None
            top_motifs = stumpy.motifs(T=event_series, P=tm_matrix[:,0], min_neighbors=1, max_matches=maximalNoOfMatches)
            insert_overlap, motif_overlap, overlapDF = compare_sets(set(inserted_motif_spots), set(top_motifs[1][0]), (size/2))
            matches_match = (len(insert_overlap) >= (maximalNoOfMatches*0.8))
            maximalNoOfMatches += 10

        end_time = time.time()  
        new_row.update({"discoveredSpots": motif_overlap, "OccurancesDiscovered": len(insert_overlap), 
                        "executionTime": end_time - start_time, "windowSize": size, 
                        "DiscoveryPercentage": len(insert_overlap) / comparisonVariables["motifsToBeDiscovered"][comparisonVariables.index[0]] * 100,
                        "alignmentAccuracy": overlapDF['alignmentAccuracy'].mean()})
        experimentResults = experimentResults._append(new_row, ignore_index=True)

experimentResults.to_csv(folder_path + "experimentResults.csv")

UI Log LenLog_1_1_10_10_10_1000.csv is encoded


  experimentResults = experimentResults._append(new_row, ignore_index=True)
  experimentResults = experimentResults._append(new_row, ignore_index=True)


UI Log LenLog_1_1_10_10_1_10000.csv is encoded
UI Log LenLog_1_1_10_10_2-5_4000.csv is encoded
UI Log LenLog_1_1_10_10_5_2000.csv is encoded
UI Log LenLog_1_1_10_15_10_1500.csv is encoded
UI Log LenLog_1_1_10_15_1_15000.csv is encoded
UI Log LenLog_1_1_10_15_2-5_6000.csv is encoded
UI Log LenLog_1_1_10_15_5_3000.csv is encoded
UI Log LenLog_1_1_10_20_10_2000.csv is encoded
UI Log LenLog_1_1_10_20_1_20000.csv is encoded
UI Log LenLog_1_1_10_20_2-5_8000.csv is encoded
UI Log LenLog_1_1_10_20_5_4000.csv is encoded
UI Log LenLog_1_1_10_25_10_2500.csv is encoded
UI Log LenLog_1_1_10_25_1_25000.csv is encoded
UI Log LenLog_1_1_10_25_2-5_10000.csv is encoded
UI Log LenLog_1_1_10_25_5_5000.csv is encoded
UI Log LenLog_1_1_10_5_10_500.csv is encoded
UI Log LenLog_1_1_10_5_1_5000.csv is encoded
UI Log LenLog_1_1_10_5_2-5_2000.csv is encoded
UI Log LenLog_1_1_10_5_5_1000.csv is encoded
UI Log LenLog_1_1_15_10_10_1500.csv is encoded
UI Log LenLog_1_1_15_10_1_15000.csv is encoded
UI Log LenLog_1_1_

### Verification of single Event Logs

In [91]:
file = pd.read_csv(folder_path + "LenLog_1_1_30_5_2-5_6000.csv")
uiLog = encoding_UiLog(file, cooccuranceBased=True)

size = 15
#Discovery motifs in the dataset
tm_matrix, event_series = discover_motifs(uiLog, size)

motif_idx_tm = np.argsort(tm_matrix[:, 0])[0]
nearest_neighbor_idx_tm = tm_matrix[motif_idx_tm, 1]
top_motifs = stumpy.motifs(T=event_series, P=tm_matrix[:,0], min_neighbors=1, max_matches=10)
top_motifs

(array([[0.        , 0.98718044, 2.20448408, 2.21605981, 2.35616234,
         2.46303571, 2.51940822, 2.61822832, 2.71273244, 2.74764111]]),
 array([[1308, 2009, 4575, 3163, 3266, 3782, 5215, 4211, 2413, 1584]],
       dtype=int64))

In [92]:
liste = [5950, 5695, 5471, 5127, 4969, 4829, 4685, 4645, 4577, 4213, 4201, 3835, 3685, 3674, 3470, 3323, 3125, 3029, 2921, 2645, 2567, 2280, 2167, 1660, 1300, 1172, 1094, 704, 693, 425]

top_motifs = stumpy.motifs(T=event_series, P=tm_matrix[:,0], min_neighbors=1, max_matches=10)

insert_overlap, motif_overlap, overlapDF = compare_sets(set(liste), set(top_motifs[1][0]), (size/2))
print(overlapDF['alignmentAccuracy'].mean())
motif_overlap

2.0


[4575, 4211]