## Import

In [248]:
import pandas as pd
import pm4py as pm
import numpy as np
import psutil as pu

from sklearn.linear_model import LinearRegression

training_file_path = 'BPI_Challenge_2012.XE-training.csv'

### load CSV with pandas and pm4py

pm4py gives two extra column: @@index and @@case_index

In [249]:
def format_event_log(path):
    event_log = pd.read_csv(path, sep=',')
    event_log = pm.format_dataframe(
        event_log, 
        case_id='case concept:name', 
        activity_key='event concept:name', 
        timestamp_key='event time:timestamp'
    )
    return event_log
#event_log

### Helper methods

Function that adds the Ground truth collumns for event and time of the event
- Every value in the ground truth collumns is the actual next event or time of the next evet in the data set
- Used for naive simulator
- Used for verification purposes

In [250]:
# assume sorted by caseID and time
def caseHasNextEvent(df, index):
    if index >= len(df) - 1:
        return False
    if df.loc[index, 'case concept:name'] == df.loc[index+1, 'case concept:name']:
        return True
    return False

def writeGroundtruth(df):
    df = df.sort_values(by=['case concept:name','event time:timestamp'])
    #add new columns containing the name of the next event in the case and the time when it happens
    df = df.assign(ground_truth_activity='')
    df = df.assign(ground_truth_time='')

    for ind in df.index:
        if caseHasNextEvent(df, ind):
            df.at[ind,'ground_truth_activity'] = df.loc[ind+1,'event concept:name']
            df.at[ind,'ground_truth_time'] = df.loc[ind+1, 'event time:timestamp']
        else:
            df.at[ind,'ground_truth_activity'] = None
            df.at[ind,'ground_truth_time'] = None
    return df

#df_event = writeGroundtruth(event_log)
#df_event.to_csv('check_writeGroundtruth_out.csv')

Helper method for naive estimators

In [251]:
def computeTimeDifference(df):
    # df['time_until_next_event'] = 0  # initialize new column with zeros
    df = df.assign(time_until_next_event=0) # initialize new column

    # iterate over each row of the dataframe
    for i, row in df.iterrows():        
        # check if there is a next row with the same case
        if caseHasNextEvent(df, i):
            nextTime = df.loc[i+1, 'event time:timestamp']
            currentTime = row['event time:timestamp']
            timeDiff = nextTime - currentTime
            df.at[i, 'time_until_next_event'] = timeDiff.total_seconds()
    
    return df



Function that splits the data set into separate traces
- Parameter $traces$ is a list containing all the individual traces
- Each trace in traces is a list containing all the events in the trace in the order that they happen
- Helper method for prefix extraction

In [252]:
def split_into_traces(event_log):
    traces = []
    last_trace = []
    for j in range (0, event_log.shape[0] - 1):
        if event_log["case concept:name"][j] == event_log["case concept:name"][j + 1]:
            last_trace.append(event_log["event concept:name"][j])
        else:
            last_trace.append(event_log["event concept:name"][j])
            traces.append(last_trace)
            last_trace = []   
    last_trace.append(event_log["event concept:name"][event_log.shape[0] - 1])
    traces.append(last_trace)
    return traces

def split_into_traecs2(event_log):
    traces = []
    last_location = 0
    for j in range (0, event_log.shape[0] - 1):
        if not event_log["case concept:name"][j] == event_log["case concept:name"][j + 1]:
            traces.append(event_log.loc[last_location : j])
            last_location = j + 1
    traces.append(event_log.loc[last_location : event_log.shape[0] - 1])
    return traces

Prefix extraction
- extract event lists of odd length like 1, 3, 5, 7, 9, 11, 13, 15 from the traces for prediction
- and the ground truth
- store them in a list of prefixes

In [253]:
def prefix_extraction(traces, prefix_lengths):
    prefixes = []
    for trace in traces:
        for length in prefix_lengths:
            if trace.shape[0] > length:
                prefixes.append(trace.loc[:length - 1])
    return prefixes

Agregation encoding
- endoce traces into numerical data for prediction
- adds the ground truth at the end of the encoded prefix

In [254]:
def create_df(event_log):
    events = []
    events = event_log['event concept:name'].unique()

    events_dict = {'ground_truth' : []}
    for event in events:
        events_dict[event] = []

    return pd.DataFrame(events_dict)

def aggregation_encoding(prefixes, event_log):
    aggregation_encoding = create_df(event_log)
    new_encoding = ['truth', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    index = 0
    for prefix in prefixes:
        aggregation_encoding.loc[index] = new_encoding
        aggregation_encoding['ground_truth'][index] = prefix['ground_truth_activity'][len(prefix.index) - 1]
        for event in prefix["event concept:name"]:
            aggregation_encoding[event][index] = aggregation_encoding[event][index] + 1
        index += 1
    aggregation_encoding.to_csv("aggregation_encoding.csv")
    return aggregation_encoding

## Multivalue Regression

In [255]:
def multivalueRegression(df_encoded_train, df_encoded_test):
    dependent_Variable = 'GTE'
    independent_Variable = df_encoded_train.colums.tolist()
    independent_Variable = independent_Variable.remove(dependent_Variable)

    X_train = df_encoded_train[independent_Variable].values
    Y_train = df_encoded_train[dependent_Variable].values

    regressor = LinearRegression()
    regressor.fit(X_train, Y_train)

## Naive predictor base on mode and mean
1. for each row find next activity of the case and its timestamp
2. compute the time it take for the next event to be log in the db
3. for each activity find the most common next activity (mode)
4. for each activity find the average time between next activity
5. Have 3. and 4. in a DataFrame
6. base on the current activity write the prediction of the next activity and time it will take

In [256]:
def naive_estimators():
    event_log = format_event_log(training_file_path)
    df_naive_predictor_dict = writeGroundtruth(event_log)
    df_naive_predictor_dict = computeTimeDifference(df_naive_predictor_dict)
    df_naive_predictor_result = df_naive_predictor_dict.copy()
    df_naive_predictor_dict = df_naive_predictor_dict.groupby(['event concept:name']).agg(
        naive_prediction_activity = ('ground_truth_activity', pd.Series.mode),
        naive_prediction_time = ('time_until_next_event', 'mean')
    )

    df_naive_predictor_result = df_naive_predictor_result.assign(naive_prediction_activity='')
    df_naive_predictor_result = df_naive_predictor_result.assign(naive_prediction_time=0)
    for i, r in df_naive_predictor_result.iterrows():
        this_event = r['event concept:name']
        next_event = df_naive_predictor_dict.loc[this_event,'naive_prediction_activity']
        next_event_time = df_naive_predictor_dict.loc[this_event,'naive_prediction_time']
        df_naive_predictor_result.at[i,'naive_prediction_activity'] = next_event
        df_naive_predictor_result.at[i,'naive_prediction_time'] = next_event_time
    df_naive_predictor_result.to_csv('naive_predictor_result.csv')
    # df_naive_predictor_result
    # df_naive_predictor_result
    # df_naive_predictor_dict

## Sprint 2 Estimators
Event Estimator

In [257]:
def sprint_two_estimators():
    naive_predictor_path = "naive_predictor_result.csv"
    event_log = format_event_log(naive_predictor_path)
    traces = split_into_traecs2(event_log)
    prefix_lengths = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]
    prefixes = prefix_extraction(traces, prefix_lengths)
    agregation = aggregation_encoding(prefixes, event_log)
    return agregation
     

Time Estimator

## Visualisation

## Error model

In [258]:
def error_model_activity():
    predictor_path = "naive_predictor_result.csv"
    df_error_model = pd.read_csv(predictor_path)
    df_error_model['error_activity'] = df_error_model['ground_truth_activity'] == df_error_model['naive_prediction_activity']
    df_error_model['error_time'] = df_error_model['naive_prediction_time'] - df_error_model['time_until_next_event']
    correct_predictions = df_error_model['error_activity'].value_counts()
    percentage = correct_predictions[True] / len(df_error_model['error_activity']) * 100
    mean_absolute_error = df_error_model['error_time'].mean()
    print(f'Percentage of True values: {percentage:.1f}%')
    print(mean_absolute_error)

## Memory and CPU Usage

In [259]:
def check_memory_cpu() : 
    print('The CPU usage is: ', pu.cpu_percent(4))
    print('RAM memory % used:', pu.virtual_memory()[2])
    print('RAM Used (GB):', pu.virtual_memory()[3]/1000000000)

## Main class
1. All functions that are needed all called in this cell.
2. No other cell runs code other than the main cell.

In [260]:
output = sprint_two_estimators()
output

  event_log = pm.format_dataframe(


KeyError: 1