# Main

In [None]:
import os
# Restrict which CUDA devices this process can see. The variable is set before
# TensorFlow is imported so that it is already in place when the framework
# enumerates GPUs; do not move it below the import.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import tensorflow as tf
# Sanity check: show the GPUs TensorFlow detected under the restriction above.
print("Available GPUs:", tf.config.list_physical_devices('GPU'))

# Query the CUDA device through PyCUDA and print its name, memory and compute
# capability.
import pycuda.driver as cuda
import pycuda.autoinit  # imported for its side effects only; never referenced by name below
# NOTE(review): with CUDA_VISIBLE_DEVICES="1", ordinal 0 presumably refers to the
# single visible GPU (physical GPU 1) — confirm on the target machine.
device = cuda.Device(0)
print("Device Name:", device.name())
# Divide by 1024**2 so the value matches the "MB" label printed next to it.
print("Total Memory:", device.total_memory() / (1024 ** 2), "MB")
print("Compute Capability:", device.compute_capability())

In [None]:
# Imports
import argparse
import numpy as np
import pandas as pd
from sklearn import metrics 

from package.processtransformer import constants
from package.processtransformer.models import transformer
from package.processtransformer.data.loader import LogsDataLoader
from package.processtransformer.data.processor import LogsDataProcessor

# Local directory for the datasets (the cells below pass dir_path='datasets'
# to LogsDataProcessor, i.e. the same location).
data_dir = "./datasets/"
# Create the directory when it is missing. exist_ok=True makes this a no-op if
# it already exists and removes the check-then-create race of a separate
# os.path.exists() test, so the cell can be re-run safely.
os.makedirs(data_dir, exist_ok=True)

# Next Activity

In [None]:
# --- Next-activity task: preprocess the event log, load it, train a model ---

# Training hyperparameters
learning_rate = 0.001
batch_size = 12
epochs = 1

dataset_name = "bpi_2012"
task = constants.Task.NEXT_ACTIVITY

# `columns` names, in order, the case id, the activity name and the timestamp
# columns of the CSV.
data_processor = LogsDataProcessor(
    name=dataset_name,
    filepath="BPI_Challenge_2012.csv",
    columns=["case:concept:name", "concept:name", "time:timestamp"],
    dir_path='datasets',
    datetime_format="ISO8601",
    pool=4,
)
data_processor.process_logs(task=task, sort_temporally=False)

# Load the processed train/test frames together with the vocabularies and
# size information returned by the loader.
data_loader = LogsDataLoader(name=dataset_name)
(
    train_df,
    test_df,
    x_word_dict,
    y_word_dict,
    max_case_length,
    vocab_size,
    num_output,
) = data_loader.load_data(task)

# Tokenised training inputs and targets for next-activity prediction
train_token_x, train_token_y = data_loader.prepare_data_next_activity(
    train_df, x_word_dict, y_word_dict, max_case_length
)

# Build the transformer classifier
transformer_model = transformer.get_next_activity_model(
    max_case_length=max_case_length,
    vocab_size=vocab_size,
    output_dim=num_output,
)

# The loss is configured with from_logits=True, i.e. it treats the model
# outputs as unnormalised scores.
transformer_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

transformer_model.fit(
    train_token_x,
    train_token_y,
    epochs=epochs,
    batch_size=batch_size,
)

## Evaluation with equally weighted prefixes (k)

In [None]:
# Evaluate the next-activity model separately for every prefix length k and
# report the unweighted average of each metric, i.e. every prefix length that
# has test data counts equally regardless of how many rows it contains.
# The averaged accuracy is also written to "<dataset_name>_unweighted.txt".
k, accuracies, fscores, precisions, recalls = [], [], [], [], []
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    if len(test_data_subset) == 0:
        # No test rows for this prefix length; nothing to score.
        continue

    test_token_x, test_token_y = data_loader.prepare_data_next_activity(
        test_data_subset, x_word_dict, y_word_dict, max_case_length)
    # Predicted class = index of the largest model output along axis 1.
    y_pred = np.argmax(transformer_model.predict(test_token_x), axis=1)

    accuracy = metrics.accuracy_score(test_token_y, y_pred)
    precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
        test_token_y, y_pred, average="weighted")

    k.append(i)
    accuracies.append(accuracy)
    fscores.append(fscore)
    precisions.append(precision)
    recalls.append(recall)

if not k:
    # Previously this case surfaced as a NameError on an unbound loop variable.
    raise ValueError("No test prefixes found: test_df has no rows with k in "
                     "range(max_case_length).")

# Take the averages BEFORE appending the summary row. The earlier version
# appended np.mean(accuracy) (the last prefix's accuracy only) and then
# averaged the list including that row, which biased the reported accuracy.
mean_accuracy = np.mean(accuracies)
mean_fscore = np.mean(fscores)
mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)

# Summary row: index one past the last prefix length, holding the averages.
# max_case_length equals the old `i + 1` but does not depend on the leaked
# loop variable.
k.append(max_case_length)
accuracies.append(mean_accuracy)
fscores.append(mean_fscore)
precisions.append(mean_precision)
recalls.append(mean_recall)

# Kept under the original name so later code reading `accuracy` still works.
accuracy = mean_accuracy
print('Average accuracy across all prefixes:', accuracy)
print('Average f-score across all prefixes:', mean_fscore)
print('Average precision across all prefixes:', mean_precision)
print('Average recall across all prefixes:', mean_recall)

with open(f"{dataset_name}_unweighted.txt", 'w') as file:
    # Persist the averaged accuracy as plain text
    file.write(str(accuracy))

## Evaluation with weighted prefixes (k)

In [None]:
# Score the next-activity model per prefix length k, then combine the
# per-prefix scores into averages weighted by the number of test rows in each
# prefix bucket. The weighted accuracy is written to
# "<dataset_name>_weighted.txt".
k, accuracies, fscores, precisions, recalls = [], [], [], [], []
weighted_accuracies, weighted_fscores, weighted_precisions, weighted_recalls = [], [], [], []

# Running count of test rows over all evaluated prefix lengths
total_instances = 0

for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    num_instances = len(test_data_subset)
    if num_instances == 0:
        # Empty bucket: contributes nothing to any of the lists
        continue

    total_instances += num_instances

    test_token_x, test_token_y = data_loader.prepare_data_next_activity(
        test_data_subset, x_word_dict, y_word_dict, max_case_length)
    # Predicted class = index of the largest model output along axis 1
    y_pred = np.argmax(transformer_model.predict(test_token_x), axis=1)

    accuracy = metrics.accuracy_score(test_token_y, y_pred)
    precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
        test_token_y, y_pred, average="weighted")

    # Raw per-prefix scores
    k.append(i)
    accuracies.append(accuracy)
    fscores.append(fscore)
    precisions.append(precision)
    recalls.append(recall)

    # The same scores scaled by the bucket size, summed up after the loop
    weighted_accuracies.append(accuracy * num_instances)
    weighted_fscores.append(fscore * num_instances)
    weighted_precisions.append(precision * num_instances)
    weighted_recalls.append(recall * num_instances)

# Weighted average = sum(score * bucket size) / total number of rows
average_accuracy = sum(weighted_accuracies) / total_instances
average_fscore = sum(weighted_fscores) / total_instances
average_precision = sum(weighted_precisions) / total_instances
average_recall = sum(weighted_recalls) / total_instances

print('Weighted average accuracy across all prefixes:', average_accuracy)
print('Weighted average f-score across all prefixes:', average_fscore)
print('Weighted average precision across all prefixes:', average_precision)
print('Weighted average recall across all prefixes:', average_recall)

with open(f"{dataset_name}_weighted.txt", 'w') as out_file:
    # Persist the weighted accuracy as plain text
    out_file.write(str(average_accuracy))

# Next Time

In [None]:
# --- Next-time task: preprocess the event log, load it, train a model ---

# Training hyperparameters
learning_rate = 0.001
batch_size = 12
epochs = 100

dataset_name = "bpi_2012"
task = constants.Task.NEXT_TIME

# `columns` names, in order, the case id, the activity name and the timestamp
# columns of the CSV.
data_processor = LogsDataProcessor(
    name=dataset_name,
    filepath="BPI_Challenge_2012.csv",
    columns=["case:concept:name", "concept:name", "time:timestamp"],
    dir_path='datasets',
    datetime_format="ISO8601",
    pool=4,
)
data_processor.process_logs(task=task, sort_temporally=False)

# Load the processed train/test frames together with the vocabularies and
# size information returned by the loader.
data_loader = LogsDataLoader(name=dataset_name)
(
    train_df,
    test_df,
    x_word_dict,
    y_word_dict,
    max_case_length,
    vocab_size,
    num_output,
) = data_loader.load_data(task)

# Training inputs for next-time prediction: token input, time input and the
# target (named train_token_y, passed as `y` to fit below), plus the two
# scalers that the evaluation cells reuse.
(
    train_token_x,
    train_time_x,
    train_token_y,
    time_scaler,
    y_scaler,
) = data_loader.prepare_data_next_time(train_df, x_word_dict, max_case_length)

# Build the transformer regressor
transformer_model = transformer.get_next_time_model(
    max_case_length=max_case_length,
    vocab_size=vocab_size,
)

transformer_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.LogCosh(),
)

# The model takes two inputs: the token input and the time input
transformer_model.fit(
    [train_token_x, train_time_x],
    train_token_y,
    epochs=epochs,
    batch_size=batch_size,
)

## Evaluation with equally weighted prefixes (k)

In [None]:
# TODO: check the k-values
# Evaluate the next-time model separately for every prefix length k and report
# the unweighted average of MAE, MSE and RMSE, i.e. every prefix length that
# has test data counts equally regardless of how many rows it contains.
k, maes, mses, rmses = [], [], [], []
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    if len(test_data_subset) == 0:
        # No test rows for this prefix length; nothing to score.
        continue

    # The fitted scalers from training are passed back in.
    # NOTE(review): the trailing False presumably switches the loader to
    # evaluation mode (reuse rather than refit the scalers) — confirm against
    # LogsDataLoader.prepare_data_next_time.
    test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_next_time(
        test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False)

    y_pred = transformer_model.predict([test_token_x, test_time_x])
    # Undo the target scaling so that the errors are computed on unscaled values.
    _test_y = y_scaler.inverse_transform(test_y)
    _y_pred = y_scaler.inverse_transform(y_pred)

    # Compute the MSE once and derive the RMSE from it instead of calling the
    # metric twice.
    mse = metrics.mean_squared_error(_test_y, _y_pred)

    k.append(i)
    maes.append(metrics.mean_absolute_error(_test_y, _y_pred))
    mses.append(mse)
    rmses.append(np.sqrt(mse))

if not k:
    # Previously this case produced NaN averages or a NameError on `i`.
    raise ValueError("No test prefixes found: test_df has no rows with k in "
                     "range(max_case_length).")

# Take the averages before appending the summary row so the reported numbers
# come from the per-prefix values only.
mean_mae = np.mean(maes)
mean_mse = np.mean(mses)
mean_rmse = np.mean(rmses)

# Summary row: index one past the last prefix length, holding the averages.
# max_case_length equals the old `i + 1` but does not depend on the leaked
# loop variable.
k.append(max_case_length)
maes.append(mean_mae)
mses.append(mean_mse)
rmses.append(mean_rmse)
print('Average MAE across all prefixes:', mean_mae)
print('Average MSE across all prefixes:', mean_mse)
print('Average RMSE across all prefixes:', mean_rmse)


# results_df = pd.DataFrame({"k":k, "mean_absolute_error":maes, 
#     "mean_squared_error":mses, 
#     "root_mean_squared_error":rmses})
# results_df.to_csv(result_path+"_next_time.csv", index=False)

## Evaluation with weighted prefixes (k)

In [None]:
# Evaluate the next-time model per prefix length k and report MAE, MSE and
# RMSE averaged with weights equal to the number of test rows in each prefix
# bucket.
k, maes, mses, rmses = [], [], [], []
weights = []

for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    num_samples = len(test_data_subset)
    if num_samples == 0:
        # No test rows for this prefix length; nothing to score.
        continue

    # The fitted scalers from training are passed back in.
    # NOTE(review): the trailing False presumably switches the loader to
    # evaluation mode (reuse rather than refit the scalers) — confirm against
    # LogsDataLoader.prepare_data_next_time.
    test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_next_time(
        test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False)

    y_pred = transformer_model.predict([test_token_x, test_time_x])
    # Undo the target scaling so that the errors are computed on unscaled values.
    _test_y = y_scaler.inverse_transform(test_y)
    _y_pred = y_scaler.inverse_transform(y_pred)

    # Compute the MSE once and derive the RMSE from it instead of calling the
    # metric twice.
    mse = metrics.mean_squared_error(_test_y, _y_pred)

    k.append(i)
    weights.append(num_samples)
    maes.append(metrics.mean_absolute_error(_test_y, _y_pred))
    mses.append(mse)
    rmses.append(np.sqrt(mse))

# Calculate weighted averages
total_weight = np.sum(weights)
if total_weight == 0:
    # np.average would otherwise fail with an opaque ZeroDivisionError.
    raise ValueError("No test prefixes found: test_df has no rows with k in "
                     "range(max_case_length).")

weighted_mae = np.average(maes, weights=weights)
weighted_mse = np.average(mses, weights=weights)
# Note: this is the weighted mean of the per-prefix RMSEs, not the square root
# of weighted_mse.
weighted_rmse = np.average(rmses, weights=weights)

# Summary row: index one past the last prefix length, holding the weighted
# averages. max_case_length equals the old `i + 1` but does not depend on the
# leaked loop variable.
k.append(max_case_length)
maes.append(weighted_mae)
mses.append(weighted_mse)
rmses.append(weighted_rmse)

print('Weighted MAE across all prefixes:', weighted_mae)
print('Weighted MSE across all prefixes:', weighted_mse)
print('Weighted RMSE across all prefixes:', weighted_rmse)

# Remaining Time

In [None]:
# --- Remaining-time task: preprocess the event log, load it, train a model ---

# Training hyperparameters
learning_rate = 0.001
batch_size = 12
epochs = 100

dataset_name = "bpi_2012"
task = constants.Task.REMAINING_TIME

# `columns` names, in order, the case id, the activity name and the timestamp
# columns of the CSV.
data_processor = LogsDataProcessor(
    name=dataset_name,
    filepath="BPI_Challenge_2012.csv",
    columns=["case:concept:name", "concept:name", "time:timestamp"],
    dir_path='datasets',
    datetime_format="ISO8601",
    pool=4,
)
data_processor.process_logs(task=task, sort_temporally=False)

# Load the processed train/test frames together with the vocabularies and
# size information returned by the loader.
data_loader = LogsDataLoader(name=dataset_name)
(
    train_df,
    test_df,
    x_word_dict,
    y_word_dict,
    max_case_length,
    vocab_size,
    num_output,
) = data_loader.load_data(task)

# Training inputs for remaining-time prediction: token input, time input and
# the target (named train_token_y, passed as `y` to fit below), plus the two
# scalers that the evaluation cells reuse.
(
    train_token_x,
    train_time_x,
    train_token_y,
    time_scaler,
    y_scaler,
) = data_loader.prepare_data_remaining_time(train_df, x_word_dict, max_case_length)

# Build the transformer regressor
transformer_model = transformer.get_remaining_time_model(
    max_case_length=max_case_length,
    vocab_size=vocab_size,
)

transformer_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.LogCosh(),
)

# The model takes two inputs: the token input and the time input
transformer_model.fit(
    [train_token_x, train_time_x],
    train_token_y,
    epochs=epochs,
    batch_size=batch_size,
)

## Evaluation with equally weighted prefixes (k)

In [None]:
# TODO: check the k-values
# Evaluate the remaining-time model separately for every prefix length k and
# report the unweighted average of MAE, MSE and RMSE, i.e. every prefix length
# that has test data counts equally regardless of how many rows it contains.
k, maes, mses, rmses = [], [], [], []
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    if len(test_data_subset) == 0:
        # No test rows for this prefix length; nothing to score.
        continue

    # The fitted scalers from training are passed back in.
    # NOTE(review): the trailing False presumably switches the loader to
    # evaluation mode (reuse rather than refit the scalers) — confirm against
    # LogsDataLoader.prepare_data_remaining_time.
    test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_remaining_time(
        test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False)

    y_pred = transformer_model.predict([test_token_x, test_time_x])
    # Undo the target scaling so that the errors are computed on unscaled values.
    _test_y = y_scaler.inverse_transform(test_y)
    _y_pred = y_scaler.inverse_transform(y_pred)

    # Compute the MSE once and derive the RMSE from it instead of calling the
    # metric twice.
    mse = metrics.mean_squared_error(_test_y, _y_pred)

    k.append(i)
    maes.append(metrics.mean_absolute_error(_test_y, _y_pred))
    mses.append(mse)
    rmses.append(np.sqrt(mse))

if not k:
    # Previously this case produced NaN averages or a NameError on `i`.
    raise ValueError("No test prefixes found: test_df has no rows with k in "
                     "range(max_case_length).")

# Take the averages before appending the summary row so the reported numbers
# come from the per-prefix values only.
mean_mae = np.mean(maes)
mean_mse = np.mean(mses)
mean_rmse = np.mean(rmses)

# Summary row: index one past the last prefix length, holding the averages.
# max_case_length equals the old `i + 1` but does not depend on the leaked
# loop variable.
k.append(max_case_length)
maes.append(mean_mae)
mses.append(mean_mse)
rmses.append(mean_rmse)
print('Average MAE across all prefixes:', mean_mae)
print('Average MSE across all prefixes:', mean_mse)
print('Average RMSE across all prefixes:', mean_rmse)


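# NOTE: `result_path` is not defined in the cells shown here; set it before enabling this export.
# results_df = pd.DataFrame({"k":k, "mean_absolute_error":maes, 
#     "mean_squared_error":mses, 
#     "root_mean_squared_error":rmses})
# results_df.to_csv(result_path+"_remaining_time.csv", index=False)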

## Evaluation with weighted prefixes (k)

In [None]:
# Evaluate the remaining-time model per prefix length k and report MAE, MSE
# and RMSE averaged with weights equal to the number of test rows in each
# prefix bucket.
k, maes, mses, rmses = [], [], [], []
weights = []

for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    num_samples = len(test_data_subset)
    if num_samples == 0:
        # No test rows for this prefix length; nothing to score.
        continue

    # The fitted scalers from training are passed back in.
    # NOTE(review): the trailing False presumably switches the loader to
    # evaluation mode (reuse rather than refit the scalers) — confirm against
    # LogsDataLoader.prepare_data_remaining_time.
    test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_remaining_time(
        test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False)

    y_pred = transformer_model.predict([test_token_x, test_time_x])
    # Undo the target scaling so that the errors are computed on unscaled values.
    _test_y = y_scaler.inverse_transform(test_y)
    _y_pred = y_scaler.inverse_transform(y_pred)

    # Compute the MSE once and derive the RMSE from it instead of calling the
    # metric twice.
    mse = metrics.mean_squared_error(_test_y, _y_pred)

    k.append(i)
    weights.append(num_samples)
    maes.append(metrics.mean_absolute_error(_test_y, _y_pred))
    mses.append(mse)
    rmses.append(np.sqrt(mse))

# Calculate weighted averages
total_weight = np.sum(weights)
if total_weight == 0:
    # np.average would otherwise fail with an opaque ZeroDivisionError.
    raise ValueError("No test prefixes found: test_df has no rows with k in "
                     "range(max_case_length).")

weighted_mae = np.average(maes, weights=weights)
weighted_mse = np.average(mses, weights=weights)
# Note: this is the weighted mean of the per-prefix RMSEs, not the square root
# of weighted_mse.
weighted_rmse = np.average(rmses, weights=weights)

# Summary row: index one past the last prefix length, holding the weighted
# averages. max_case_length equals the old `i + 1` but does not depend on the
# leaked loop variable.
k.append(max_case_length)
maes.append(weighted_mae)
mses.append(weighted_mse)
rmses.append(weighted_rmse)

print('Weighted MAE across all prefixes:', weighted_mae)
print('Weighted MSE across all prefixes:', weighted_mse)
print('Weighted RMSE across all prefixes:', weighted_rmse)