Author: Thais Rodrigues Neubauer

This code trains and evaluates (with MAPE) regressors for the remaining time prediction task. Adapt the values of the variables in the "Global variables" section to properly define paths and to control the characteristics of the model generation pipeline.

The code was created to be executed using Google Colab. If that's not the case, delete the last two lines of the "Import required libraries" section.

# Import required libraries

In [1]:
import pickle
import math
import pandas as pd
import numpy as np
import networkx as nx

###########################
# When using Google Colab #
###########################
from google.colab import files, drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


# Global variables


**🔴🔴🔴 IMPORTANT INFO 🔴🔴🔴**

<font color='red'> *   Activity nodes are being linked to event nodes, so if you don't want event nodes, link the activities to something else (e.g. case nodes)

<font color='red'> *   Everything related to VARIANTS is commented because this option is not included in the resulting filename


In [None]:
# Folder (on the mounted Google Drive) that holds the event-log CSV.
# load_log() reads f'{LOG_DIR}{LOG_FILENAME}.csv', so the trailing '/' is required.
LOG_DIR = '/content/gdrive/My Drive/ProcessMiningResearch/Interactive_trace_clustering/experiments_log_incidentes/rt_prediction/LSTM/data/'
# Log file name WITHOUT the '.csv' extension; it is also the prefix of every saved graph file name.
LOG_FILENAME = 'incidents-time_annotated'
#'incident_evt_log-processed1-classiclogformat' #'helpdesk-time_annotated'

# Column names in the event log. The trailing comments keep alternative values
# (presumably for a differently formatted log -- confirm before switching).
CASE_ID = 'number'        #'CaseID'
ACTIVITY_COL = 'activity' #'ActivityID'
TS_COL = 'sys_updated_on' #'CompleteTimestamp'


# Output folder for the pickled graphs; the file name is appended directly, so keep the trailing '/'.
GRAPHS_DIR = '/content/gdrive/My Drive/ProcessMiningResearch/Interactive_trace_clustering/experiments_log_incidentes/preprocess/graph_embeddings/results/graph/'

# Fraction of the (sorted) case ids whose events are kept when building the graphs.
TRAIN_PERC = 2/3

# Each *_NODES / EVENT_POSITION list enumerates the option values to try:
# one graph is generated for every combination of the listed values.
EVENT_NODES = [True]
# Column holding the event identifier used for the 'event#...' nodes.
EVENT_ID = 'EventID'

EVENT_POSITION = [False, True]
# Column used for the 'eventpos#...' nodes.
EVENT_POS_COL = 'PrefixLen'

TIME_NODES = [False, True]
# Number of bins passed to pd.cut when bucketing TIME_ANNOT1 and TIME_ANNOT2.
TIME_ANNOT_N_BUCKETS = 20
# Time-annotation columns. TIME_ANNOT1/2 are bucketed with pd.cut, TIME_ANNOT3 is
# used as-is, and TIME_ANNOT4 is divided by 60*60 and truncated to an integer
# (presumably seconds -> hour of day; confirm the unit against the log).
TIME_ANNOT1 = 'TimeSincePreviousEvent'
TIME_ANNOT2 = 'TimeSinceCaseStarted'
TIME_ANNOT3 = 'Weekday'
TIME_ANNOT4 = 'SameDayTime'

CASE_ATTRIB_NODES = [False, True]
# Every column listed here gets a '<column>#<value>' node linked to the case node.
CASE_ATTRIB_COLS = ['category'] #if attrib_nodes is false, this doesn't matter


#VARIANT_NODES = [False]
#VARIANT_COL = 'trace'
#SEQUENCES_FILE_PATH = '/content/gdrive/My Drive/ProcessMiningResearch/Interactive_trace_clustering/datasets/incidentlog/incident_evt_log-processed1-sequences.csv'

# Auxiliary functions


In [None]:
def load_log():
  """Read the event log CSV and return it in chronological order.

  The file read is f'{LOG_DIR}{LOG_FILENAME}.csv'. Columns labelled '0' or 0
  are discarded when present, TS_COL is parsed as datetime, and the rows are
  sorted by TS_COL, then CASE_ID, then 'PrefixLen' (only when that column
  exists). The returned DataFrame has a fresh 0..n-1 index.
  """
  events = pd.read_csv(f'{LOG_DIR}{LOG_FILENAME}.csv')

  # Drop leftover columns labelled '0' (string) or 0 (integer), if any.
  for leftover in ('0', 0):
    if leftover in events.columns:
      events = events.drop(columns=[leftover])

  events[TS_COL] = pd.to_datetime(events[TS_COL])

  # 'PrefixLen' is only used as a tie-breaker when the log provides it.
  sort_keys = [TS_COL, CASE_ID]
  if 'PrefixLen' in events.columns:
    sort_keys.append('PrefixLen')

  return events.sort_values(by=sort_keys).reset_index(drop=True)

In [None]:
def create_edges(process_case, graph, event_nodes, event_position, time_annot_nodes, case_attrib_nodes):
  """Add to `graph` the weight-1 edges derived from one row of the event log.

  Meant to be used with DataFrame.apply(..., axis=1); nothing is returned.

  process_case: one event-log row, indexable by column name.
  graph: object exposing add_weighted_edges_from (a networkx graph in this notebook).
  event_nodes: link the event node to its case node; event position and time
    annotation nodes can only be created when this is true.
  event_position: also link an 'eventpos#' node to the event node.
  time_annot_nodes: also link the event node to the 'tsle#', 'tscs#',
    'weekday#' and 'hofday#' nodes.
  case_attrib_nodes: link the case node to one '<column>#<value>' node per
    column in CASE_ATTRIB_COLS.

  Note: the activity node is always linked to the event node, whatever the
  value of `event_nodes`.
  """
  def node(prefix, column):
    # Node label is '<prefix>#<value of the row in that column>'.
    return f'{prefix}#{str(process_case[column])}'

  def link(source, target):
    graph.add_weighted_edges_from([[source, target, 1]])

  link(node('event', EVENT_ID), node('activity', ACTIVITY_COL))

  if event_nodes:
    link(node('event', EVENT_ID), node('case', CASE_ID))
    # Event position and time annotation nodes hang off the event node.
    if event_position:
      link(node('eventpos', EVENT_POS_COL), node('event', EVENT_ID))
    if time_annot_nodes:
      time_targets = (
        ('tsle', TIME_ANNOT1+'_BIN'),
        ('tscs', TIME_ANNOT2+'_BIN'),
        ('weekday', TIME_ANNOT3),
        ('hofday', TIME_ANNOT4+'_BIN'),
      )
      for prefix, column in time_targets:
        link(node('event', EVENT_ID), node(prefix, column))

  # Variant nodes are disabled (the option is not part of the output file name):
  #if VARIANT_NODES_OP:
  #  GRAPH.add_weighted_edges_from([[process_case[CASE_ID],'variant#'+process_case[VARIANT_COL],1]])

  if case_attrib_nodes:
    for attrib in CASE_ATTRIB_COLS:
      link(node('case', CASE_ID), node(attrib, attrib))

In [None]:
def check_event_id_col():
  """Ensure the global LOG has the event-identifier column named by EVENT_ID.

  When the column is missing, LOG is replaced by a copy whose index has been
  turned into a column (reset_index) and one column is renamed to EVENT_ID:
  a column labelled '0' if there is one, otherwise the 'index' column that
  reset_index creates for an unnamed index. When the column already exists,
  LOG is left untouched.

  The column name comes from EVENT_ID (instead of a hard-coded 'EventID') so
  that it always matches the column create_edges() reads.
  """
  global LOG
  if EVENT_ID in LOG.columns:
    return

  LOG = LOG.reset_index()
  # A '0' column takes precedence over the freshly exposed index as id source.
  source_col = '0' if '0' in LOG.columns else 'index'
  LOG = LOG.rename(columns={source_col: EVENT_ID})


def check_time_annot_bins():
  """Add the '<column>_BIN' time-annotation columns to the global LOG.

  Nothing is done when TIME_ANNOT1+'_BIN' already exists. Otherwise
  TIME_ANNOT1 and TIME_ANNOT2 are each split into TIME_ANNOT_N_BUCKETS
  equal-width bins (integer bin codes from pd.cut), and TIME_ANNOT4 is
  divided by 60 twice and truncated to an integer.
  """
  global LOG
  if TIME_ANNOT1+'_BIN' in LOG.columns:
    return

  for annot in (TIME_ANNOT1, TIME_ANNOT2):
    bin_codes = pd.cut(LOG[annot], bins=TIME_ANNOT_N_BUCKETS, labels=False)
    LOG[annot+'_BIN'] = bin_codes
    # Quantile-based alternative kept for reference:
    #LOG[col+'_QBIN'] = pd.qcut(LOG[col], q=int(math.floor(LOG.shape[0]/TIME_ANNOT_N_BUCKETS)), duplicates='drop', labels=False)

  # presumably seconds -> whole hours; confirm the unit of TIME_ANNOT4
  hours = LOG[TIME_ANNOT4]/60/60
  LOG[TIME_ANNOT4+'_BIN'] = hours.astype(int)


#def check_variant_col():
#  global LOG
#  if VARIANT_COL not in LOG.columns:
#    seq = pd.read_csv(SEQUENCES_FILE_PATH)
#    LOG = LOG.merge(seq).fillna('') #fillna to avoid "na" nodes in the graph

# Create and save graphs

In [None]:
# Driver: build one graph per combination of node options from the training
# part of the log and pickle each graph into GRAPHS_DIR.
LOG = load_log()

# Training split: the first TRAIN_PERC fraction of the case ids, taken in sorted id order.
cases = sorted(LOG[CASE_ID].unique())
TRAIN_CASES = cases[:int(len(cases)*TRAIN_PERC)]
# .copy() makes LOG an independent frame, so the columns added later by
# check_time_annot_bins() are not written to a slice of the full log
# (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity).
LOG = LOG[LOG[CASE_ID].isin(TRAIN_CASES)].copy()

# create_edges() reads the EVENT_ID column for every row, whatever the value of
# event_nodes, so the column must exist before any graph is built.
check_event_id_col()

for event_nodes in EVENT_NODES:
  for event_position in EVENT_POSITION:
    for time_annot_nodes in TIME_NODES:
      for case_attrib_nodes in CASE_ATTRIB_NODES:
        #for VARIANT_NODES_OP in VARIANT_NODES:
        graph_filename = f'{LOG_FILENAME}--evnt{event_nodes}_evntpos{event_position}_times{time_annot_nodes}_attribs{case_attrib_nodes}.pickle'
        print(graph_filename)

        if time_annot_nodes: check_time_annot_bins()
        #if VARIANT_NODES_OP: check_variant_col()

        # create_edges() fills the graph as a side effect of the row-wise apply.
        graph = nx.Graph()
        LOG.apply(create_edges, args=(graph, event_nodes, event_position, time_annot_nodes, case_attrib_nodes), axis=1)

        print(graph.number_of_nodes(), graph.number_of_edges())

        # Context manager guarantees the file is flushed and closed.
        with open(f'{GRAPHS_DIR}{graph_filename}', 'wb') as graph_file:
          pickle.dump(graph, graph_file)

incidents-time_annotated--evntTrue_evntposFalse_timesFalse_attribsTrue.pickle
115305 214962


In [None]:
#nx.draw(GRAPH) #not working for big graphs
#nx.draw_networkx(GRAPH)
#import matplotlib.pyplot as plt
#limits = plt.axis("off")
#https://networkx.org/documentation/stable/reference/drawing.html