In [29]:
import pandas as pd
import pm4py as pm
import numpy as np

# Path of the CSV event log that the load cell reads with pd.read_csv.
# NOTE(review): the load cell also writes its formatted output to a file with
# this same name, so the input is overwritten on every run - confirm intended.
training_file_path = 'formatted_event_log.csv'




### load CSV with pandas and pm4py

pm4py adds two extra columns: `@@index` and `@@case_index`.

In [30]:
def format_event_log(event_log):
    """Run ``pm.format_dataframe`` on ``event_log`` with this log's column names.

    Parameters
    ----------
    event_log : pandas.DataFrame
        Raw event log containing the columns 'case concept:name',
        'event concept:name' and 'event time:timestamp'.

    Returns
    -------
    The value returned by ``pm.format_dataframe``. Per the notebook's own
    note, pm4py adds the columns ``@@index`` and ``@@case_index``.
    """
    # Which column plays which role for pm4py.
    column_roles = {
        'case_id': 'case concept:name',
        'activity_key': 'event concept:name',
        'timestamp_key': 'event time:timestamp',
    }
    return pm.format_dataframe(event_log, **column_roles)
# Load the CSV event log and normalise it with pm4py.
event_log = pd.read_csv(training_file_path, sep=',')
event_log = format_event_log(event_log)
# index=False: this file name is also the one the notebook reads from
# (training_file_path). Writing the row index would add one more
# "Unnamed: 0" column to the input each time the notebook is re-run.
event_log.to_csv("formatted_event_log.csv", index=False)
#event_log

  event_log = pm.format_dataframe(


In [31]:
def findPrintInfo():
    """Print an overview of the global ``event_log``.

    Prints, in order: the number of events (rows) and of distinct case IDs,
    the start and end activities reported by pm4py, and the distinct values
    of the 'event concept:name' column. Returns nothing.
    """
    separator = '======================================================='

    case_ids = event_log['case concept:name']
    counts_template = "Number of events: {}\nNumber of cases: {}"
    print(counts_template.format(len(event_log), len(case_ids.unique())))
    print(separator)

    boundaries_template = "Start activities: {}\nEnd activities: {}"
    print(boundaries_template.format(
        pm.get_start_activities(event_log),
        pm.get_end_activities(event_log),
    ))
    print(separator)

    print('events: ')
    print(event_log['event concept:name'].unique())

### Helper methods

In [32]:
def caseHasNextEvent(df, index):
    """Tell whether the row labelled ``index`` is followed by a row of the same case.

    "Followed" means the row whose index label is ``index + 1``, i.e. the row
    that ``df.loc[index + 1]`` returns. The frame is assumed to be sorted by
    case ID and time, so that consecutive labels are consecutive events.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with a 'case concept:name' column and integer index labels.
    index : int
        Index label (not position) of the current row.

    Returns
    -------
    bool
        True if both labels exist and carry the same case ID, else False.
    """
    labels = df.index
    # Test label membership rather than comparing the label with len(df):
    # the two only agree for a default 0..n-1 index. On any other index the
    # length check gives false negatives or lets the lookup raise KeyError.
    if index not in labels or index + 1 not in labels:
        return False
    case_ids = df['case concept:name']
    return bool(case_ids.loc[index] == case_ids.loc[index + 1])

def writeGroundtruth(df):
    """Add the next event of the same case to every row as ground truth.

    The frame is sorted by case ID and timestamp, then two columns are added:

    * ``ground_truth_activity`` - 'event concept:name' of the next event in
      the same case.
    * ``ground_truth_time`` - 'event time:timestamp' of that next event.

    The last event of each case has no successor; both columns then hold a
    missing value (NaN / NaT), which is written to CSV as an empty field.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with the columns 'case concept:name', 'event concept:name'
        and 'event time:timestamp'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` (original index labels kept) with the two
        ground-truth columns added.
    """
    df = df.sort_values(by=['case concept:name', 'event time:timestamp'])

    # Shift within each case so "next" follows the sorted row order.
    # sort_values keeps the old index labels, so looking up label + 1 would
    # ignore the sort whenever the input was not already sorted.
    next_in_case = df.groupby('case concept:name', sort=False)[
        ['event concept:name', 'event time:timestamp']
    ].shift(-1)

    return df.assign(
        ground_truth_activity=next_in_case['event concept:name'],
        ground_truth_time=next_in_case['event time:timestamp'],
    )

#df_event = writeGroundtruth(event_log)
#df_event.to_csv('check_writeGroundtruth_out.csv')

In [33]:
def computeTimeDifference(df):
    # df['time_until_next_event'] = 0  # initialize new column with zeros
    df = df.assign(time_until_next_event=0) # initialize new column

    # iterate over each row of the dataframe
    for i, row in df.iterrows():        
        # check if there is a next row with the same case
        if caseHasNextEvent(df, i):
            nextTime = df.loc[i+1, 'event time:timestamp']
            currentTime = row['event time:timestamp']
            timeDiff = nextTime - currentTime
            df.at[i, 'time_until_next_event'] = timeDiff.total_seconds()
    
    return df



## Naive predictor based on mode and mean
1. For each row, find the next activity of the case and its timestamp.
2. Compute the time it takes for the next event to be logged in the database.
3. For each activity, find the most common next activity (mode).
4. For each activity, find the average time until the next activity (mean).
5. Store the results of steps 3 and 4 in a DataFrame.
6. Based on the current activity, write the prediction of the next activity and the time it will take.

In [34]:

def _most_common_value(values):
    """Return the most frequent non-missing value of ``values``, or None if there is none."""
    modes = values.mode()
    # mode() returns every tied value (sorted) and nothing for an all-missing
    # group; the aggregation below needs exactly one scalar per activity.
    return modes.iloc[0] if len(modes) else None


# Per-event frame: ground truth (next activity / time) plus seconds until the next event.
df_naive_predictor_result = computeTimeDifference(writeGroundtruth(event_log))

# Lookup table indexed by activity: most common next activity and mean waiting time.
df_naive_predictor_dict = df_naive_predictor_result.groupby(['event concept:name']).agg(
    naive_prediction_activity=('ground_truth_activity', _most_common_value),
    naive_prediction_time=('time_until_next_event', 'mean'),
)
# df_naive_predictor_result
# df_naive_predictor_dict



In [35]:

# Look up the prediction for every event by its current activity.
# Series.map replaces the per-row loop, which wrote float means into an
# int-initialised column (deprecated in recent pandas). Activities missing
# from the lookup table get a missing value instead of a KeyError.
_current_activity = df_naive_predictor_result['event concept:name']
df_naive_predictor_result = df_naive_predictor_result.assign(
    naive_prediction_activity=_current_activity.map(
        df_naive_predictor_dict['naive_prediction_activity']
    ),
    naive_prediction_time=_current_activity.map(
        df_naive_predictor_dict['naive_prediction_time']
    ),
)
df_naive_predictor_result.to_csv('naive_predictor_result.csv')
# df_naive_predictor_result
