In [1]:
import pandas as pd
import pm4py as pm
import numpy as np

training_file_path = '12689204\check_writeGroundtruth_out.csv'




### load CSV with pandas and pm4py

pm4py gives two extra column: @@index and @@case_index

In [4]:
def format_event_log(event_log):
    event_log = pm.format_dataframe(
        event_log, 
        case_id='case concept:name', 
        activity_key='event concept:name', 
        timestamp_key='event time:timestamp'
    )
    return event_log
event_log = pd.read_csv(training_file_path, sep=',')
event_log = format_event_log(event_log)
event_log.to_csv("formatted_event_log.csv")
# event_log

  event_log = pm.format_dataframe(


In [4]:
def findPrintInfo():
    num_events = len(event_log)
    num_cases = len(event_log['case concept:name'].unique())
    print("Number of events: {}\nNumber of cases: {}".format(num_events, num_cases))
    print('=======================================================')

    start_activities = pm.get_start_activities(event_log)
    end_activities = pm.get_end_activities(event_log)
    print("Start activities: {}\nEnd activities: {}".format(start_activities, end_activities))
    print('=======================================================')

    print('events: ')
    print(event_log['event concept:name'].unique())

### Helper methods

In [9]:
# assume sorted by caseID and time
def caseHasNextEvent(df, index):
    if index >= len(df) - 1:
        return False
    if df.loc[index, 'case concept:name'] == df.loc[index+1, 'case concept:name']:
        return True
    return False

def writeGroundtruth(df):
    df = df.sort_values(by=['case concept:name','event time:timestamp'])
    #add new columns containing the name of the next event in the case and the time when it happens
    df = df.assign(ground_truth_activity='')
    df = df.assign(ground_truth_time='')

    for ind in df.index:
        if caseHasNextEvent(df, ind):
            df.at[ind,'ground_truth_activity'] = df.loc[ind+1,'event concept:name']
            df.at[ind,'ground_truth_time'] = df.loc[ind+1, 'event time:timestamp']
        else:
            df.at[ind,'ground_truth_activity'] = None
            df.at[ind,'ground_truth_time'] = None
    return df

df_event = writeGroundtruth(event_log)
df_event.to_csv('check_writeGroundtruth_out.csv')

In [7]:
def computeTimeDifference(df):
    # df['time_until_next_event'] = 0  # initialize new column with zeros
    df = df.assign(time_until_next_event=0) # initialize new column

    # iterate over each row of the dataframe
    for i, row in df.iterrows():        
        # check if there is a next row with the same case
        if caseHasNextEvent(df, i):
            nextTime = df.loc[i+1, 'event time:timestamp']
            currentTime = row['event time:timestamp']
            timeDiff = nextTime - currentTime
            df.at[i, 'time_until_next_event'] = timeDiff.total_seconds()
    
    return df



In [8]:
df_timeDiff = computeTimeDifference(df_event)
df_timeDiff.to_csv("check_computeTimeDifference.csv")
df_timeDiff.head(5)

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp,case:concept:name,concept:name,time:timestamp,@@index,@@case_index,ground_truth_activity,ground_truth_time,time_until_next_event
0,0,173688,2011-01-10 00:38:44.546000+00:00,20000,112.0,A_SUBMITTED,COMPLETE,2011-01-10 00:38:44.546000+00:00,173688,A_SUBMITTED,2011-01-10 00:38:44.546000+00:00,0,0,A_PARTLYSUBMITTED,2011-01-10 00:38:44.880000+00:00,0.334
1,1,173688,2011-01-10 00:38:44.546000+00:00,20000,112.0,A_PARTLYSUBMITTED,COMPLETE,2011-01-10 00:38:44.880000+00:00,173688,A_PARTLYSUBMITTED,2011-01-10 00:38:44.880000+00:00,1,0,A_PREACCEPTED,2011-01-10 00:39:37.906000+00:00,53.026
2,2,173688,2011-01-10 00:38:44.546000+00:00,20000,112.0,A_PREACCEPTED,COMPLETE,2011-01-10 00:39:37.906000+00:00,173688,A_PREACCEPTED,2011-01-10 00:39:37.906000+00:00,2,0,W_Completeren aanvraag,2011-01-10 00:39:38.875000+00:00,0.969
3,3,173688,2011-01-10 00:38:44.546000+00:00,20000,112.0,W_Completeren aanvraag,SCHEDULE,2011-01-10 00:39:38.875000+00:00,173688,W_Completeren aanvraag,2011-01-10 00:39:38.875000+00:00,3,0,W_Completeren aanvraag,2011-01-10 11:36:46.437000+00:00,39427.562
4,4,173688,2011-01-10 00:38:44.546000+00:00,20000,,W_Completeren aanvraag,START,2011-01-10 11:36:46.437000+00:00,173688,W_Completeren aanvraag,2011-01-10 11:36:46.437000+00:00,4,0,A_ACCEPTED,2011-01-10 11:42:43.308000+00:00,356.871


In [9]:
#new collumn that indexes events within a case

def addEventNumber(df):
    # df['event_number'] = 0  # initialize new column with zeros
    df = df.assign(event_number=0)
    
    # iterate over each unique case
    for case in df['case concept:name'].unique():
        case_df = df[df['case concept:name'] == case].copy()  # subset dataframe for current case
        size = len(case_df)
        case_df['event_number'] = np.arange(size)  # assign incrementing event numbers
        df.update(case_df)  # update original dataframe with new event numbers
    
    return df



In [10]:
df_eventNum = addEventNumber(df_timeDiff)
df_eventNum.to_csv('check_addEventNumber.csv')
df_eventNum.head(10)


  df.update(case_df)  # update original dataframe with new event numbers


Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp,case:concept:name,concept:name,time:timestamp,@@index,@@case_index,ground_truth_activity,ground_truth_time,time_until_next_event,event_number
0,0.0,173688.0,2011-01-10 00:38:44.546000+00:00,20000.0,112.0,A_SUBMITTED,COMPLETE,2011-01-10 00:38:44.546000+00:00,173688,A_SUBMITTED,2011-01-10 00:38:44.546000+00:00,0.0,0.0,A_PARTLYSUBMITTED,2011-01-10 00:38:44.880000+00:00,0.334,0.0
1,1.0,173688.0,2011-01-10 00:38:44.546000+00:00,20000.0,112.0,A_PARTLYSUBMITTED,COMPLETE,2011-01-10 00:38:44.880000+00:00,173688,A_PARTLYSUBMITTED,2011-01-10 00:38:44.880000+00:00,1.0,0.0,A_PREACCEPTED,2011-01-10 00:39:37.906000+00:00,53.026,1.0
2,2.0,173688.0,2011-01-10 00:38:44.546000+00:00,20000.0,112.0,A_PREACCEPTED,COMPLETE,2011-01-10 00:39:37.906000+00:00,173688,A_PREACCEPTED,2011-01-10 00:39:37.906000+00:00,2.0,0.0,W_Completeren aanvraag,2011-01-10 00:39:38.875000+00:00,0.969,2.0
3,3.0,173688.0,2011-01-10 00:38:44.546000+00:00,20000.0,112.0,W_Completeren aanvraag,SCHEDULE,2011-01-10 00:39:38.875000+00:00,173688,W_Completeren aanvraag,2011-01-10 00:39:38.875000+00:00,3.0,0.0,W_Completeren aanvraag,2011-01-10 11:36:46.437000+00:00,39427.562,3.0
4,4.0,173688.0,2011-01-10 00:38:44.546000+00:00,20000.0,,W_Completeren aanvraag,START,2011-01-10 11:36:46.437000+00:00,173688,W_Completeren aanvraag,2011-01-10 11:36:46.437000+00:00,4.0,0.0,A_ACCEPTED,2011-01-10 11:42:43.308000+00:00,356.871,4.0
5,5.0,173688.0,2011-01-10 00:38:44.546000+00:00,20000.0,10862.0,A_ACCEPTED,COMPLETE,2011-01-10 11:42:43.308000+00:00,173688,A_ACCEPTED,2011-01-10 11:42:43.308000+00:00,5.0,0.0,A_FINALIZED,2011-01-10 11:45:09.243000+00:00,145.935,5.0
6,7.0,173688.0,2011-01-10 00:38:44.546000+00:00,20000.0,10862.0,A_FINALIZED,COMPLETE,2011-01-10 11:45:09.243000+00:00,173688,A_FINALIZED,2011-01-10 11:45:09.243000+00:00,6.0,0.0,O_SELECTED,2011-01-10 11:45:09.243000+00:00,0.0,6.0
7,6.0,173688.0,2011-01-10 00:38:44.546000+00:00,20000.0,10862.0,O_SELECTED,COMPLETE,2011-01-10 11:45:09.243000+00:00,173688,O_SELECTED,2011-01-10 11:45:09.243000+00:00,7.0,0.0,O_CREATED,2011-01-10 11:45:11.197000+00:00,1.954,7.0
8,8.0,173688.0,2011-01-10 00:38:44.546000+00:00,20000.0,10862.0,O_CREATED,COMPLETE,2011-01-10 11:45:11.197000+00:00,173688,O_CREATED,2011-01-10 11:45:11.197000+00:00,8.0,0.0,O_SENT,2011-01-10 11:45:11.380000+00:00,0.183,8.0
9,9.0,173688.0,2011-01-10 00:38:44.546000+00:00,20000.0,10862.0,O_SENT,COMPLETE,2011-01-10 11:45:11.380000+00:00,173688,O_SENT,2011-01-10 11:45:11.380000+00:00,9.0,0.0,W_Nabellen offertes,2011-01-10 11:45:11.554000+00:00,0.174,9.0


In [11]:
# obsolete method; done alternatively in createNaiveTimePredictor
def getLongestTraceLength(df):
    traceLengths = df.groupby('case concept:name').size()  # count number of events for each trace
    longest_trace_length = traceLengths.max()  # get maximum number of events
    return longest_trace_length

longestTrace = getLongestTraceLength(df_event)


In [12]:
def createNaiveTimePredictor(df):
    # group by event number and calculate mean of time_until_next_event
    df_naiveTimePredictor = df.groupby('event_number')['time_until_next_event'].mean().reset_index()
    return df_naiveTimePredictor


In [13]:
naiveTimePredictor = createNaiveTimePredictor(df_eventNum)
naiveTimePredictor.head(30)

Unnamed: 0,event_number,time_until_next_event
0,0.0,0.3552432
1,1.0,39.211
2,2.0,137601.8
3,3.0,985441.8
4,4.0,609.4284
5,5.0,5185.245
6,6.0,983.0827
7,7.0,564885.2
8,8.0,377209.9
9,9.0,1261161.0


In [14]:
df_eventNum.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   eventID                     1000 non-null   float64            
 1   case concept:name           1000 non-null   float64            
 2   case REG_DATE               1000 non-null   datetime64[ns, UTC]
 3   case AMOUNT_REQ             1000 non-null   float64            
 4   event org:resource          809 non-null    float64            
 5   event concept:name          1000 non-null   object             
 6   event lifecycle:transition  1000 non-null   object             
 7   event time:timestamp        1000 non-null   datetime64[ns, UTC]
 8   case:concept:name           1000 non-null   object             
 9   concept:name                1000 non-null   object             
 10  time:timestamp              1000 non-null   datetime64[ns, UT

## Naive estimators
# Naive event estimator

## Naive predictor base on mode and mean
1. for each row find next activity of the case and its timestamp
2. compute the time it take for the next event to be log in the db
3. for each activity find the most common next activity (mode)
4. for each activity find the average time between next activity
5. Have 3. and 4. in a DataFrame
6. base on the current activity write the prediction of the next activity and time it will take

In [42]:

df_naive_predictor_dict = writeGroundtruth(event_log)
df_naive_predictor_dict = computeTimeDifference(df_naive_predictor_dict)
df_naive_predictor_result = df_naive_predictor_dict.copy()
df_naive_predictor_dict = df_naive_predictor_dict.groupby(['event concept:name']).agg(
    naive_prediction_activity = ('ground_truth_activity', pd.Series.mode),
    naive_prediction_time = ('time_until_next_event', 'mean')
)
# df_naive_predictor_result
# df_naive_predictor_dict



In [43]:

df_naive_predictor_result = df_naive_predictor_result.assign(naive_prediction_activity='')
df_naive_predictor_result = df_naive_predictor_result.assign(naive_prediction_time=0)
for i, r in df_naive_predictor_result.iterrows():
    this_event = r['event concept:name']
    next_event = df_naive_predictor_dict.loc[this_event,'naive_prediction_activity']
    next_event_time = df_naive_predictor_dict.loc[this_event,'naive_prediction_time']
    df_naive_predictor_result.at[i,'naive_prediction_activity'] = next_event
    df_naive_predictor_result.at[i,'naive_prediction_time'] = next_event_time
df_naive_predictor_result.to_csv('naive_predictor_result.csv')
# df_naive_predictor_result
