# Main

# Next Activity

In [7]:
# Imports
import os
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics 

from package.processtransformer import constants
from package.processtransformer.models import transformer
from package.processtransformer.data.loader import LogsDataLoader
from package.processtransformer.data.processor import LogsDataProcessor

data_dir = "./datasets/"
if not os.path.exists(data_dir): 
  os.mkdir(data_dir)
  
dataset_name = "helpdesk"
# data_processor = LogsDataProcessor(name=dataset_name, filepath="helpdesk.csv",  
#                                     columns = ["Case ID", "Activity", "Complete Timestamp"], #specify the columns name containing case_id, activity name and timestamp 
#                                     dir_path='datasets', datetime_format="%Y-%m-%d %H:%M:%S.%f", pool = 4)
# data_processor.process_logs(task=constants.Task.NEXT_ACTIVITY, sort_temporally= False)

# Load data
data_loader = LogsDataLoader(name = dataset_name)

(train_df, test_df, x_word_dict, y_word_dict, max_case_length, 
    vocab_size, num_output) = data_loader.load_data(constants.Task.NEXT_ACTIVITY)

# Prepare training examples for next activity prediction task
train_token_x, train_token_y = data_loader.prepare_data_next_activity(train_df, 
    x_word_dict, y_word_dict, max_case_length)

learning_rate = 0.001
batch_size = 12
epochs = 5

# Create and train a transformer model
transformer_model = transformer.get_next_activity_model(
    max_case_length=max_case_length, 
    vocab_size=vocab_size,
    output_dim=num_output)

transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

transformer_model.fit(train_token_x, train_token_y, 
    epochs=epochs, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x17d721b1e50>

## Evaluation with equal weighted prefixes (k)

In [8]:
# Evaluate over all the prefixes (k) and save the results
k, accuracies,fscores, precisions, recalls = [],[],[],[],[]
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"]==i]
    if len(test_data_subset) > 0:
        test_token_x, test_token_y = data_loader.prepare_data_next_activity(test_data_subset, 
            x_word_dict, y_word_dict, max_case_length)   
        y_pred = np.argmax(transformer_model.predict(test_token_x), axis=1)
        accuracy = metrics.accuracy_score(test_token_y, y_pred)
        precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
            test_token_y, y_pred, average="weighted")
        k.append(i)
        accuracies.append(accuracy)
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)

k.append(i + 1)
accuracies.append(np.mean(accuracy))
fscores.append(np.mean(fscores))
precisions.append(np.mean(precisions))
recalls.append(np.mean(recalls))

print('Average accuracy across all prefixes:', np.mean(accuracies))
print('Average f-score across all prefixes:', np.mean(fscores))
print('Average precision across all prefixes:', np.mean(precisions))
print('Average recall across all prefixes:', np.mean(recalls))



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average accuracy across all prefixes: 0.851619184481714
Average f-score across all prefixes: 0.7961396312085119
Average precision across all prefixes: 0.7988560937711622
Average recall across all prefixes: 0.8392541165218569


## Evaluation with weighted prefixes (k)

In [9]:
k, accuracies, fscores, precisions, recalls = [], [], [], [], []
weighted_accuracies, weighted_fscores, weighted_precisions, weighted_recalls = [], [], [], []

total_instances = 0

for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    num_instances = len(test_data_subset)
    
    if num_instances > 0:
        total_instances += num_instances
        test_token_x, test_token_y = data_loader.prepare_data_next_activity(
            test_data_subset, x_word_dict, y_word_dict, max_case_length
        )
        y_pred = np.argmax(transformer_model.predict(test_token_x), axis=1)
        
        accuracy = metrics.accuracy_score(test_token_y, y_pred)
        precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
            test_token_y, y_pred, average="weighted"
        )
        
        k.append(i)
        accuracies.append(accuracy)
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)
        
        weighted_accuracies.append(accuracy * num_instances)
        weighted_fscores.append(fscore * num_instances)
        weighted_precisions.append(precision * num_instances)
        weighted_recalls.append(recall * num_instances)

# Compute weighted averages
average_accuracy = sum(weighted_accuracies) / total_instances
average_fscore = sum(weighted_fscores) / total_instances
average_precision = sum(weighted_precisions) / total_instances
average_recall = sum(weighted_recalls) / total_instances

print('Weighted average accuracy across all prefixes:', average_accuracy)
print('Weighted average f-score across all prefixes:', average_fscore)
print('Weighted average precision across all prefixes:', average_precision)
print('Weighted average recall across all prefixes:', average_recall)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Weighted average accuracy across all prefixes: 0.7984962406015037
Weighted average f-score across all prefixes: 0.7362141988873097
Weighted average precision across all prefixes: 0.7304172434950408
Weighted average recall across all prefixes: 0.7984962406015037


# Next Time

In [1]:
# Imports
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics 

from package.processtransformer import constants
from package.processtransformer.models import transformer
from package.processtransformer.data.loader import LogsDataLoader
from package.processtransformer.data.processor import LogsDataProcessor

data_dir = "./datasets/"
if not os.path.exists(data_dir): 
  os.mkdir(data_dir)
  
  
dataset_name = "helpdesk"
# data_processor = LogsDataProcessor(name=dataset_name, filepath="helpdesk.csv",  
#                                     columns = ["Case ID", "Activity", "Complete Timestamp"], #specify the columns name containing case_id, activity name and timestamp 
#                                     dir_path='datasets', datetime_format="%Y-%m-%d %H:%M:%S.%f", pool = 4)
# data_processor.process_logs(task=constants.Task.NEXT_TIME, sort_temporally= False)

# Load data
data_loader = LogsDataLoader(name = dataset_name)
(train_df, test_df, x_word_dict, y_word_dict, max_case_length, 
    vocab_size, num_output) = data_loader.load_data(constants.Task.NEXT_TIME)

# Prepare training examples for next activity prediction task
train_token_x, train_time_x, train_token_y, time_scaler, y_scaler = data_loader.prepare_data_next_time(train_df, 
                                                        x_word_dict, max_case_length)

learning_rate = 0.001
batch_size = 12
epochs = 3

# Create and train a transformer model
transformer_model = transformer.get_next_time_model(
    max_case_length=max_case_length, 
    vocab_size=vocab_size)

transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.LogCosh())

transformer_model.fit([train_token_x, train_time_x], train_token_y, 
    epochs=epochs, batch_size=batch_size)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x17d70b949a0>

## Evaluation with equal weighted prefixes (k)

In [2]:
################# check the k-values #########################################
# Evaluate over all the prefixes (k) and save the results
k, maes, mses, rmses = [],[],[],[]
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"]==i]
    if len(test_data_subset) > 0:
        test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_next_time(
            test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False)   

        y_pred = transformer_model.predict([test_token_x, test_time_x])
        _test_y = y_scaler.inverse_transform(test_y)
        _y_pred = y_scaler.inverse_transform(y_pred)

        k.append(i)
        maes.append(metrics.mean_absolute_error(_test_y, _y_pred))
        mses.append(metrics.mean_squared_error(_test_y, _y_pred))
        rmses.append(np.sqrt(metrics.mean_squared_error(_test_y, _y_pred)))

k.append(i + 1)
maes.append(np.mean(maes))
mses.append(np.mean(mses))
rmses.append(np.mean(rmses))  
print('Average MAE across all prefixes:', np.mean(maes))
print('Average MSE across all prefixes:', np.mean(mses))
print('Average RMSE across all prefixes:', np.mean(rmses))


# results_df = pd.DataFrame({"k":k, "mean_absolute_error":maes, 
#     "mean_squared_error":mses, 
#     "root_mean_squared_error":rmses})
# results_df.to_csv(result_path+"_next_time.csv", index=False)

Average MAE across all prefixes: 3.0287704
Average MSE across all prefixes: 32.57313
Average RMSE across all prefixes: 5.136947


## Evaluation with weighted prefixes (k)

In [3]:
k, maes, mses, rmses = [],[],[],[]
weights = []

for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"]==i]
    if len(test_data_subset) > 0:
        test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_next_time(
            test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False)   

        y_pred = transformer_model.predict([test_token_x, test_time_x])
        _test_y = y_scaler.inverse_transform(test_y)
        _y_pred = y_scaler.inverse_transform(y_pred)

        k.append(i)
        num_samples = len(test_data_subset)
        weights.append(num_samples)

        maes.append(metrics.mean_absolute_error(_test_y, _y_pred))
        mses.append(metrics.mean_squared_error(_test_y, _y_pred))
        rmses.append(np.sqrt(metrics.mean_squared_error(_test_y, _y_pred)))

# Calculate weighted averages
total_weight = np.sum(weights)
weighted_mae = np.average(maes, weights=weights)
weighted_mse = np.average(mses, weights=weights)
weighted_rmse = np.average(rmses, weights=weights)

k.append(i + 1)
maes.append(weighted_mae)
mses.append(weighted_mse)
rmses.append(weighted_rmse)

print('Weighted MAE across all prefixes:', weighted_mae)
print('Weighted MSE across all prefixes:', weighted_mse)
print('Weighted RMSE across all prefixes:', weighted_rmse)

Weighted MAE across all prefixes: 4.139796846755072
Weighted MSE across all prefixes: 53.59351683905371
Weighted RMSE across all prefixes: 7.068014570871801
