## Import

In [33]:
import pandas as pd
import pm4py as pm
import numpy as np
import psutil as pu

from  sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Locations of the training and test event logs (CSV); naive_estimators loads both via format_event_log.
training_file_path = 'db/BPI_Challenge_2012.XE-training.csv'
testing_file_path = 'db/BPI_Challenge_2012.XE-test.csv'

### load CSV with pandas and pm4py

pm4py gives two extra columns: @@index and @@case_index

In [17]:
def format_event_log(path):
    """Read a comma-separated event log and pass it through pm4py's formatter.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    The dataframe returned by ``pm.format_dataframe`` when told which columns
    hold the case id, the activity name and the timestamp.
    """
    raw_log = pd.read_csv(path, sep=',')
    column_roles = {
        'case_id': 'case concept:name',
        'activity_key': 'event concept:name',
        'timestamp_key': 'event time:timestamp',
    }
    return pm.format_dataframe(raw_log, **column_roles)
#event_log

### Helper methods

Function that finds the length of the longest trace in a list of traces

In [18]:
def find_longest_trace(traces):
    """Return the number of rows of the longest trace.

    Parameters
    ----------
    traces : iterable
        Objects exposing ``.shape`` (e.g. DataFrames), one per trace.

    Returns
    -------
    int
        The largest ``shape[0]`` found, or 0 when ``traces`` is empty.
    """
    # The builtin is used directly instead of a local variable named ``max``
    # that would shadow it.
    return max((trace.shape[0] for trace in traces), default=0)

Function that adds the ground-truth columns for the next event and the time of the next event
- Every value in the ground-truth columns is the actual next event, or the time of the next event, in the data set
- Used for the naive simulator
- Used for verification purposes

In [19]:
def caseHasNextEvent(df, index):
    """Tell whether the row labelled ``index + 1`` belongs to the same case as ``index``.

    The frame is assumed to be sorted by case id and time. ``index`` is compared
    against ``len(df) - 1`` and is also used as a ``.loc`` label, so the frame is
    expected to carry the default 0..n-1 index.

    Returns ``False`` for the last row and whenever the case id changes.
    """
    case_column = 'case concept:name'
    is_last_row = index >= len(df) - 1
    if is_last_row:
        return False
    return bool(df.loc[index, case_column] == df.loc[index + 1, case_column])

def writeGroundtruth(df):
    """Add the next activity and its timestamp to every event.

    The log is sorted by case id and timestamp, then each event receives the
    activity name and timestamp of the event that follows it within the same case.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with the columns 'case concept:name', 'event concept:name'
        and 'event time:timestamp'.

    Returns
    -------
    pandas.DataFrame
        The sorted log (original row labels are kept) with two extra columns,
        'ground_truth_activity' and 'ground_truth_time'. The last event of a
        case has no successor and gets a missing value (NaN / NaT) in both.
    """
    case_column = 'case concept:name'
    df = df.sort_values(by=[case_column, 'event time:timestamp'])

    # Shifting inside each case looks at the next row *by position* in the
    # sorted frame. Looking up label ``ind + 1`` instead would only be right
    # when sorting did not reorder any rows.
    per_case = df.groupby(case_column, sort=False)
    return df.assign(
        ground_truth_activity=per_case['event concept:name'].shift(-1),
        ground_truth_time=per_case['event time:timestamp'].shift(-1),
    )

#df_event = writeGroundtruth(event_log)
#df_event.to_csv('check_writeGroundtruth_out.csv')

Helper method for naive estimators

In [20]:
def computeTimeDifference(df):
    """Add 'time_until_next_event': seconds until the following event of the same case.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with 'case concept:name' and 'event time:timestamp' columns,
        ordered so that events of one case are adjacent and in time order.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a float column 'time_until_next_event'. Rows whose
        following row belongs to another case (and the very last row) get 0.
    """
    case_ids = df['case concept:name']
    timestamps = df['event time:timestamp']

    # ``shift(-1)`` pairs every row with the row physically below it, so the
    # result does not depend on the frame carrying a 0..n-1 index.
    seconds_to_next_row = pd.to_timedelta(timestamps.shift(-1) - timestamps).dt.total_seconds()
    next_row_same_case = case_ids.shift(-1) == case_ids

    return df.assign(
        time_until_next_event=seconds_to_next_row.where(next_row_same_case, 0.0)
    )



Function that splits the data set into separate traces
- The returned list $traces$ contains all the individual traces
- Each trace in $traces$ is a DataFrame holding all the events of that trace in the order in which they happen
- Helper method for prefix extraction

In [21]:
def split_into_traces(event_log):
    """Cut the event log into one DataFrame per run of rows sharing a case id.

    Rows are addressed by the labels 0..n-1, so ``event_log`` is expected to
    carry the default index. Every returned trace has a fresh 0-based index and
    keeps its original row labels in an 'index' column (from ``reset_index()``).
    """
    case_ids = event_log["case concept:name"]
    row_count = event_log.shape[0]

    traces = []
    trace_start = 0
    for row in range(row_count - 1):
        if case_ids[row] == case_ids[row + 1]:
            continue
        # The case id changes after ``row``: close the current trace here.
        traces.append(event_log.loc[trace_start:row].reset_index())
        trace_start = row + 1

    # Whatever is left after the last boundary forms the final trace.
    traces.append(event_log.loc[trace_start:row_count - 1].reset_index())
    return traces

Function that deletes traces from a list that contain events past a specified time

In [22]:
def delete_overlaping_traces(traces, end_time):
    """Remove every trace that contains an event later than ``end_time``.

    Parameters
    ----------
    traces : list
        DataFrames with an 'event time:timestamp' column. The list is modified
        in place.
    end_time :
        Cut-off timestamp; events exactly at ``end_time`` are still accepted.

    Returns
    -------
    list
        The same list object, now holding only the traces that end in time.
    """
    def _runs_past_cutoff(trace):
        # Every event is inspected, so the answer does not depend on the rows
        # being in time order, and a trace without events is never flagged.
        return bool((trace["event time:timestamp"] > end_time).any())

    # Slice assignment keeps the caller's list object, as the in-place ``del`` did.
    traces[:] = [trace for trace in traces if not _runs_past_cutoff(trace)]
    return traces

Prefix extraction
- extract event lists of odd length like 1, 3, 5, 7, 9, 11, 13, 15 from the traces for prediction
- and the ground truth
- store them in a list of prefixes

In [23]:
def prefix_extraction(traces, prefix_lengths):
    """Collect the leading events of every trace for each requested length.

    A prefix of ``length`` events is taken only from traces that have more than
    ``length`` rows. Prefixes are returned trace by trace, and within a trace in
    the order given by ``prefix_lengths``.
    """
    return [
        trace.loc[:length - 1]  # label slice, inclusive: labels up to length-1
        for trace in traces
        for length in prefix_lengths
        if trace.shape[0] > length
    ]

Aggregation encoding
- encode traces into numerical data for prediction
- adds the ground truth at the end of the encoded prefix

In [24]:
def create_dict(event_log):
    """Build the template for one encoded row.

    The result maps 'ground_truth' to the placeholder 'truth', followed by one
    zero counter per distinct value of 'event concept:name' in ``event_log``
    (in order of first appearance).
    """
    activity_names = event_log['event concept:name'].unique()
    counters = {'ground_truth': 'truth'}
    counters.update(dict.fromkeys(activity_names, 0))
    return counters

def aggregation_encoding(prefixes, event_log):
    """Encode every prefix as activity counts plus its ground-truth next activity.

    Parameters
    ----------
    prefixes : iterable of pandas.DataFrame
        Prefixes holding the columns 'event concept:name' and
        'ground_truth_activity'.
    event_log : pandas.DataFrame
        Log whose distinct activities define the count columns (see ``create_dict``).

    Returns
    -------
    pandas.DataFrame
        One row per prefix, indexed 0..n-1. The 'ground_truth' column holds the
        'ground_truth_activity' of the prefix's last event; every other column
        counts how often that activity occurs in the prefix.
    """
    # The template depends only on the event log, so it is built once instead
    # of rescanning the whole log for every prefix.
    template = create_dict(event_log)
    columns = list(template)

    rows = []
    for prefix in prefixes:
        counts = dict(template)
        # Positional access to the last event of the prefix.
        counts['ground_truth'] = prefix['ground_truth_activity'].iloc[-1]
        for event in prefix["event concept:name"]:
            counts[event] = counts[event] + 1
        rows.append(counts)

    # Build the frame in one go rather than enlarging it row by row.
    return pd.DataFrame(rows, columns=columns)

In [25]:
def get_aggregation_encoding(path, type):
    """Load an event log, extract prefixes and store their aggregation encoding.

    Parameters
    ----------
    path : str
        CSV event log to encode.
    type : str
        "train": traces with events after the hard-coded cut-off are removed,
        odd prefix lengths 1..29 are used and the result is written to
        "aggregation_encoding_train.csv".
        "test": every prefix length from 1 up to the longest trace is used and
        the result is written to "aggregation_encoding_test.csv".
        Any other value produces no prefixes and writes no file.

    Returns
    -------
    pandas.DataFrame
        The encoded prefixes as returned by ``aggregation_encoding``.
    """
    event_log = format_event_log(path)
    traces = split_into_traces(event_log)

    prefix_lengths = []
    output_path = None
    if type == "train":
        cutoff = pd.to_datetime("02-01-2012 15:28:39.244", format="%d-%m-%Y %H:%M:%S.%f")
        traces = delete_overlaping_traces(traces, cutoff)
        prefix_lengths = list(range(1, 30, 2))  # 1, 3, ..., 29
        output_path = "aggregation_encoding_train.csv"
    elif type == "test":
        # Start at 1: a zero-length prefix has no last event to take the
        # ground truth from and would make the encoder fail.
        prefix_lengths = list(range(1, find_longest_trace(traces) + 1))
        output_path = "aggregation_encoding_test.csv"

    prefixes = prefix_extraction(traces, prefix_lengths)
    aggregation = aggregation_encoding(prefixes, event_log)

    # Write the encoded frame itself; ``aggregation_encoding`` is the function.
    if output_path is not None:
        aggregation.to_csv(output_path)
    return aggregation

Regression helper methods

In [26]:
def process_X_Y(df):
    """Split an encoded frame into features and integer-coded targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregation encoding with a 'ground_truth' column and one count column
        per activity. An 'Unnamed: 0' column (the index written by ``to_csv``
        and read back by ``read_csv``) is ignored when present.

    Returns
    -------
    (X, Y)
        ``X`` holds the activity count columns. ``Y`` is the 'ground_truth'
        column with each activity name replaced by the position of its column
        in ``X``; values that match no column are left unchanged.
    """
    target_column = 'ground_truth'
    non_feature_columns = {target_column, 'Unnamed: 0'}

    # Filtering (instead of list.remove) means a frame that never went through
    # a CSV round-trip, and so has no 'Unnamed: 0' column, is accepted too.
    feature_columns = [column for column in df.columns if column not in non_feature_columns]
    activity_to_int = {activity: position for position, activity in enumerate(feature_columns)}

    X = df[feature_columns]
    Y = df[target_column].map(lambda activity: activity_to_int.get(activity, activity))

    return X, Y

def get_Regression_model_scaler(df_training_encoded):
    """Fit a min-max scaler and a logistic regression on the encoded training data.

    Features and targets come from ``process_X_Y``. The features are scaled
    with a freshly fitted ``MinMaxScaler`` before the model is trained.

    Returns
    -------
    (model, scaler)
        The fitted ``LogisticRegression`` and the fitted scaler. Data passed to
        the model later has to go through ``scaler.transform`` first.
    """
    features, targets = process_X_Y(df_training_encoded)

    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(features)

    model = LogisticRegression()
    model.fit(scaled_features, targets)

    return model, scaler


## Naive predictor based on mode and mean
1. for each row, find the next activity of the case and its timestamp
2. compute the time it takes for the next event to be logged in the database
3. for each activity, find the most common next activity (mode)
4. for each activity, find the average time until the next activity (mean)
5. store the results of steps 3 and 4 in a DataFrame
6. based on the current activity, write the prediction of the next activity and the time it will take

In [27]:
def naive_estimators():
    """Run the naive baseline and write its results to disk.

    For every activity in the training log, the most common next activity and
    the mean time until the next event are computed. Each event of the test
    log is then annotated with the values belonging to its own activity.

    Side effects
    ------------
    Writes the annotated training log to 'db/Ground_Turth.csv' and the test log
    with its predictions to 'db/Navie_Estimator.csv'.
    """
    df_train = format_event_log(training_file_path)
    df_train = writeGroundtruth(df_train)
    df_train = computeTimeDifference(df_train)

    df_test = format_event_log(testing_file_path)
    df_test = writeGroundtruth(df_test)

    def _most_common(values):
        # ``mode`` returns every value tied for first place, and nothing at all
        # for an all-missing group; reduce that to exactly one value per group.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else None

    df_naive_predictor_dict = df_train.groupby('event concept:name').agg(
        naive_prediction_activity=('ground_truth_activity', _most_common),
        naive_prediction_time=('time_until_next_event', 'mean'),
    )

    # Look up the prediction for every test event by its activity name.
    # Activities never seen in training get a missing value.
    current_activity = df_test['event concept:name']
    df_test = df_test.assign(
        naive_prediction_activity=current_activity.map(
            df_naive_predictor_dict['naive_prediction_activity']
        ),
        naive_prediction_time=current_activity.map(
            df_naive_predictor_dict['naive_prediction_time']
        ),
    )

    df_train.to_csv('db/Ground_Turth.csv')
    df_test.to_csv('db/Navie_Estimator.csv')


# Sprint 2 Estimators

### Multivalue Regression 

In [34]:
def regression_estimator(train_path='aggregation_encoding_train.csv',
                         test_path='db/aggregation_encoding_test.csv'):
    """Train the logistic regression on encoded prefixes and predict the test set.

    Parameters
    ----------
    train_path : str
        CSV with the encoded training prefixes.
    test_path : str
        CSV with the encoded test prefixes.
        NOTE(review): get_aggregation_encoding writes the test encoding to
        "aggregation_encoding_test.csv" in the working directory, while this
        default reads from 'db/' - confirm which location is intended.

    Returns
    -------
    The array returned by ``model.predict`` for the scaled test features; its
    entries are the integer activity codes produced by ``process_X_Y``.
    """
    df_encoded_train = pd.read_csv(train_path)
    df_encoded_test = pd.read_csv(test_path)

    model, scaler = get_Regression_model_scaler(df_encoded_train)

    # The test targets are not needed to make predictions.
    X_test, _ = process_X_Y(df_encoded_test)
    X_test = scaler.transform(X_test)

    return model.predict(X_test)




## Event Estimator

In [29]:
def sprint2_predict_event():
    """Build the aggregation encodings for the training and the test log.

    Returns
    -------
    (train_aggregation, test_aggregation)
        The two encoded frames. ``get_aggregation_encoding`` also writes each
        of them to a CSV file.
    """
    # NOTE(review): no function visible in this notebook writes
    # 'db/naive_predictor_result.csv' - confirm where this file comes from.
    train_file_path = 'db/naive_predictor_result.csv'
    test_file_path = 'db/BPI_Challenge_2012.XE-test.csv'
    train_aggregation = get_aggregation_encoding(train_file_path, "train")
    test_aggregation = get_aggregation_encoding(test_file_path, "test")
    return train_aggregation, test_aggregation

## Time Estimator

## Visualisation

## Error model

In [30]:
def error_model_activity(predictor_path="naive_predictor_result.csv"):
    """Report the accuracy of the activity prediction and the time error.

    Parameters
    ----------
    predictor_path : str
        CSV holding the columns 'ground_truth_activity',
        'naive_prediction_activity', 'naive_prediction_time' and
        'time_until_next_event'.

    Returns
    -------
    (percentage, mean_absolute_error)
        Share of rows (in percent) whose predicted activity equals the ground
        truth, and the mean absolute difference between predicted and actual
        time until the next event. Both values are also printed.
    """
    df_error_model = pd.read_csv(predictor_path)

    is_correct = (
        df_error_model['ground_truth_activity'] == df_error_model['naive_prediction_activity']
    )
    time_error = df_error_model['naive_prediction_time'] - df_error_model['time_until_next_event']

    # The mean of a boolean column is the fraction of True values; unlike
    # ``value_counts()[True]`` it also works when no prediction is correct.
    percentage = is_correct.mean() * 100
    # Take absolute values first so over- and under-estimates do not cancel out.
    mean_absolute_error = time_error.abs().mean()

    print(f'Percentage of True values: {percentage:.1f}%')
    print(mean_absolute_error)
    return percentage, mean_absolute_error

## Memory and CPU Usage

In [31]:
def check_memory_cpu():
    """Print the current CPU load and RAM usage as reported by psutil.

    The CPU figure is measured over a 4 second interval, so the call blocks
    for that long.
    """
    cpu_load = pu.cpu_percent(4)
    # One snapshot, so the percentage and the absolute figure describe the
    # same moment; named fields replace the positional indices 2 and 3.
    memory = pu.virtual_memory()

    print('The CPU usage is: ', cpu_load)
    print('RAM memory % used:', memory.percent)
    print('RAM Used (GB):', memory.used / 1000000000)

## Main class
1. All functions that are needed are called in this cell.
2. No other cell runs code other than the main cell.

In [36]:

# Run the naive mode/mean baseline; it reads both event logs and writes its result CSVs under db/.
naive_estimators()


  event_log = pm.format_dataframe(
