# Main

In [1]:
# Restrict this process to the first GPU. The variable is set before TensorFlow
# is imported so that the CUDA runtime only ever sees device 0.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
# Sanity check: list the GPUs TensorFlow can see.
print("Available GPUs:", tf.config.list_physical_devices('GPU'))

# Get CUDA device information
# Importing pycuda.autoinit initializes the CUDA driver and creates a context,
# which is what makes the cuda.Device(0) query below possible.
import pycuda.driver as cuda
import pycuda.autoinit
device = cuda.Device(0)
print("Device Name:", device.name())
# The byte count from total_memory() is divided by 1024**2 to print megabytes.
print("Total Memory:", device.total_memory() / (1024 ** 2), "MB")
print("Compute Capability:", device.compute_capability())

2024-10-05 01:16:33.446979: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-05 01:16:33.447036: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-05 01:16:33.447048: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-05 01:16:33.454212: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Available GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Device Name: NVIDIA RTX A6000
Total Memory: 48669.75 MB
Compute Capability: (8, 6)


In [2]:
# Imports
import argparse
import numpy as np
import pandas as pd
from sklearn import metrics 

from package.processtransformer import constants
from package.processtransformer.models import transformer
from package.processtransformer.data.loader import LogsDataLoader
from package.processtransformer.data.processor import LogsDataProcessor

# Directory that holds the datasets used by this notebook.
data_dir = "./datasets/"
# exist_ok=True makes this idempotent and avoids the check-then-create race of
# a separate os.path.exists() test followed by os.mkdir().
os.makedirs(data_dir, exist_ok=True)

# Next Activity

In [None]:
dataset_name = "helpdesk"
# One-off preprocessing, kept for reference: uncomment to (re)generate the
# processed next-activity files from a raw CSV event log.
# NOTE(review): the filepath below points at "sepsis.csv" while dataset_name is
# "helpdesk" - make sure it matches the log you actually want to process.
# data_processor = LogsDataProcessor(name=dataset_name, filepath="sepsis.csv",  
#                                     columns = ["case:concept:name", "concept:name", "time:timestamp"], #specify the columns name containing case_id, activity name and timestamp 
#                                     dir_path='datasets', datetime_format="%Y-%m-%d %H:%M:%S%z", pool = 4)
# data_processor.process_logs(task=constants.Task.NEXT_ACTIVITY, sort_temporally= False)

# Hyperparameters
learning_rate = 0.001
batch_size = 12
epochs = 100
random_seed = 42

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialization and batch shuffling are repeatable across "Restart & Run All".
# (Some GPU kernels may still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(random_seed)

# Load data
data_loader = LogsDataLoader(name = dataset_name)

(train_df, test_df, x_word_dict, y_word_dict, max_case_length, 
    vocab_size, num_output) = data_loader.load_data(constants.Task.NEXT_ACTIVITY)

# Prepare training examples for next activity prediction task
train_token_x, train_token_y = data_loader.prepare_data_next_activity(train_df, 
    x_word_dict, y_word_dict, max_case_length)

# Create and train a transformer model
transformer_model = transformer.get_next_activity_model(
    max_case_length=max_case_length, 
    vocab_size=vocab_size,
    output_dim=num_output)

# from_logits=True assumes the model's last layer emits raw (un-softmaxed)
# scores - TODO confirm against transformer.get_next_activity_model.
transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

transformer_model.fit(train_token_x, train_token_y, 
    epochs=epochs, batch_size=batch_size)

2024-10-05 01:16:35.762898: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46604 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:01:00.0, compute capability: 8.6


Epoch 1/100


2024-10-05 01:16:38.338641: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f805e7bc230 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-10-05 01:16:38.338678: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6
2024-10-05 01:16:38.342916: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-10-05 01:16:38.357350: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8902
2024-10-05 01:16:38.424935: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
 168/1121 [===>..........................] - ETA: 3s - loss: 0.6405 - sparse_categorical_accuracy: 0.7937

## Evaluation with equally weighted prefixes (k)

In [None]:
# # Evaluate over all the prefixes (k) and save the results
# k, accuracies,fscores, precisions, recalls = [],[],[],[],[]
# for i in range(max_case_length):
#     test_data_subset = test_df[test_df["k"]==i]
#     if len(test_data_subset) > 0:
#         test_token_x, test_token_y = data_loader.prepare_data_next_activity(test_data_subset, 
#             x_word_dict, y_word_dict, max_case_length)   
#         y_pred = np.argmax(transformer_model.predict(test_token_x), axis=1)
#         accuracy = metrics.accuracy_score(test_token_y, y_pred)
#         precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
#             test_token_y, y_pred, average="weighted")
#         k.append(i)
#         accuracies.append(accuracy)
#         fscores.append(fscore)
#         precisions.append(precision)
#         recalls.append(recall)

# k.append(i + 1)
# accuracies.append(np.mean(accuracies))
# fscores.append(np.mean(fscores))
# precisions.append(np.mean(precisions))
# recalls.append(np.mean(recalls))

# print('Average accuracy across all prefixes:', np.mean(accuracies))
# print('Average f-score across all prefixes:', np.mean(fscores))
# print('Average precision across all prefixes:', np.mean(precisions))
# print('Average recall across all prefixes:', np.mean(recalls))

## Evaluation with weighted prefixes (k)

In [None]:
# Evaluate the next-activity model separately for every prefix length k, then
# aggregate the per-prefix scores into a mean weighted by the number of test
# instances observed at each k. The table is written to
# "<dataset_name>_next_activity.csv" and printed.


def _weighted_mean(values, counts, total):
    """Return sum(value * count) / total, or 0 when there are no instances."""
    if total <= 0:
        return 0
    return sum(value * count for value, count in zip(values, counts)) / total


# Per-prefix results; index j of every list refers to prefix length k_list[j].
k_list, accuracies, fscores, precisions, recalls = [], [], [], [], []
num_instances_list = []

# Loop over each prefix length
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    num_instances = len(test_data_subset)
    k_list.append(i)
    num_instances_list.append(num_instances)

    if num_instances == 0:
        # No test prefixes of this length: keep a zero placeholder row so the
        # table still lists every k. Its weight is 0, so it does not influence
        # the weighted mean.
        accuracies.append(0)
        fscores.append(0)
        precisions.append(0)
        recalls.append(0)
        continue

    test_token_x, test_token_y = data_loader.prepare_data_next_activity(
        test_data_subset, x_word_dict, y_word_dict, max_case_length
    )
    # verbose=0 avoids printing one progress bar per prefix length.
    y_pred = np.argmax(transformer_model.predict(test_token_x, verbose=0), axis=1)

    accuracy = metrics.accuracy_score(test_token_y, y_pred)
    # zero_division=0 yields the same scores as the default ("warn" also
    # substitutes 0) but without an UndefinedMetricWarning for every small
    # subset in which some class is never predicted or never present.
    precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
        test_token_y, y_pred, average="weighted", zero_division=0
    )

    accuracies.append(accuracy)
    fscores.append(fscore)
    precisions.append(precision)
    recalls.append(recall)

# Compute weights for each prefix (share of all evaluated test instances)
total_instances = sum(num_instances_list)
weights = [n / total_instances if total_instances > 0 else 0 for n in num_instances_list]

# Create a DataFrame with the collected metrics
df = pd.DataFrame({
    'k': k_list,
    'weight': weights,
    'accuracy': accuracies,
    'fscore': fscores,
    'precision': precisions,
    'recall': recalls
})

# Compute weighted average metrics
average_accuracy = _weighted_mean(accuracies, num_instances_list, total_instances)
average_fscore = _weighted_mean(fscores, num_instances_list, total_instances)
average_precision = _weighted_mean(precisions, num_instances_list, total_instances)
average_recall = _weighted_mean(recalls, num_instances_list, total_instances)

# Append the weighted averages to the DataFrame as a final summary row
weighted_mean_row = {
    'k': 'Weighted Mean',
    'weight': '',
    'accuracy': average_accuracy,
    'fscore': average_fscore,
    'precision': average_precision,
    'recall': average_recall
}
df = pd.concat([df, pd.DataFrame([weighted_mean_row])], ignore_index=True)

# Save the DataFrame to a CSV file
df.to_csv(f"{dataset_name}_next_activity.csv", index=False)

# Print the DataFrame to verify
print(df)

# Next Time

In [None]:
# One-off preprocessing, kept for reference: uncomment to (re)generate the
# processed next-time files from a raw CSV event log.
# NOTE(review): the filepath below is "sepsis.csv" - make sure it matches the
# log selected through dataset_name.
# data_processor = LogsDataProcessor(name=dataset_name, filepath="sepsis.csv",  
#                                     columns = ["case:concept:name", "concept:name", "time:timestamp"], #specify the columns name containing case_id, activity name and timestamp 
#                                     dir_path='datasets', datetime_format="%Y-%m-%d %H:%M:%S%z", pool = 4)
# data_processor.process_logs(task=constants.Task.NEXT_TIME, sort_temporally= False)

# Hyperparameters
learning_rate = 0.001
batch_size = 12
epochs = 100
random_seed = 42

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialization and batch shuffling are repeatable across "Restart & Run All".
# (Some GPU kernels may still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(random_seed)

# Load data
data_loader = LogsDataLoader(name = dataset_name)
(train_df, test_df, x_word_dict, y_word_dict, max_case_length, 
    vocab_size, num_output) = data_loader.load_data(constants.Task.NEXT_TIME)

# Prepare training examples for the next time prediction task. The returned
# time_scaler and y_scaler are kept so that test data can later be prepared
# with the same scalers and predictions can be inverse-transformed.
train_token_x, train_time_x, train_token_y, time_scaler, y_scaler = data_loader.prepare_data_next_time(train_df, 
                                                        x_word_dict, max_case_length)

# Create and train a transformer model
transformer_model = transformer.get_next_time_model(
    max_case_length=max_case_length, 
    vocab_size=vocab_size)

# Regression objective: log-cosh loss on the prepared target values.
transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.LogCosh())

# The model takes two inputs: the token sequences and the time features.
transformer_model.fit([train_token_x, train_time_x], train_token_y, 
    epochs=epochs, batch_size=batch_size)

## Evaluation with equally weighted prefixes (k)

In [None]:
# ################# check the k-values #########################################
# # Evaluate over all the prefixes (k) and save the results
# k, maes, mses, rmses = [],[],[],[]
# for i in range(max_case_length):
#     test_data_subset = test_df[test_df["k"]==i]
#     if len(test_data_subset) > 0:
#         test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_next_time(
#             test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False)   

#         y_pred = transformer_model.predict([test_token_x, test_time_x])
#         _test_y = y_scaler.inverse_transform(test_y)
#         _y_pred = y_scaler.inverse_transform(y_pred)

#         k.append(i)
#         maes.append(metrics.mean_absolute_error(_test_y, _y_pred))
#         mses.append(metrics.mean_squared_error(_test_y, _y_pred))
#         rmses.append(np.sqrt(metrics.mean_squared_error(_test_y, _y_pred)))

# k.append(i + 1)
# maes.append(np.mean(maes))
# mses.append(np.mean(mses))
# rmses.append(np.mean(rmses))  
# print('Average MAE across all prefixes:', np.mean(maes))
# print('Average MSE across all prefixes:', np.mean(mses))
# print('Average RMSE across all prefixes:', np.mean(rmses))


# # results_df = pd.DataFrame({"k":k, "mean_absolute_error":maes, 
# #     "mean_squared_error":mses, 
# #     "root_mean_squared_error":rmses})
# # results_df.to_csv(result_path+"_next_time.csv", index=False)

## Evaluation with weighted prefixes (k)

In [None]:
# Score the next-time model one prefix length (k) at a time, then summarise
# the per-prefix errors with a mean weighted by the number of test instances.
k_list, num_instances_list = [], []
maes, mses, rmses = [], [], []

for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    num_samples = len(test_data_subset)

    if num_samples <= 0:
        # This prefix length does not occur in the test set: record zero
        # placeholders (with zero instances) and move on.
        k_list.append(i)
        num_instances_list.append(0)
        maes.append(0)
        mses.append(0)
        rmses.append(0)
        continue

    # Prepare the subset with the scalers obtained from the training data.
    test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_next_time(
        test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False
    )
    y_pred = transformer_model.predict([test_token_x, test_time_x])

    # Map targets and predictions back through y_scaler before measuring errors.
    _test_y = y_scaler.inverse_transform(test_y)
    _y_pred = y_scaler.inverse_transform(y_pred)

    mae = metrics.mean_absolute_error(_test_y, _y_pred)
    mse = metrics.mean_squared_error(_test_y, _y_pred)
    rmse = np.sqrt(mse)

    k_list.append(i)
    num_instances_list.append(num_samples)
    maes.append(mae)
    mses.append(mse)
    rmses.append(rmse)

# Per-prefix weights and instance-weighted aggregates; everything falls back
# to 0 when no test instance was evaluated at all.
total_instances = sum(num_instances_list)
if total_instances > 0:
    weights = [n / total_instances for n in num_instances_list]
    weighted_mae = np.average(maes, weights=num_instances_list)
    weighted_mse = np.average(mses, weights=num_instances_list)
    weighted_rmse = np.average(rmses, weights=num_instances_list)
else:
    weights = [0] * len(num_instances_list)
    weighted_mae = weighted_mse = weighted_rmse = 0

# One row per prefix length ...
df = pd.DataFrame(
    {"k": k_list, "weight": weights, "mae": maes, "mse": mses, "rmse": rmses}
)

# ... followed by a single summary row holding the weighted means.
weighted_mean_row = dict(
    k="Weighted Mean",
    weight="",
    mae=weighted_mae,
    mse=weighted_mse,
    rmse=weighted_rmse,
)
df = pd.concat([df, pd.DataFrame([weighted_mean_row])], ignore_index=True)

# Persist the table, then echo it for a quick visual check.
df.to_csv(f"{dataset_name}_next_time.csv", index=False)
print(df)

# Remaining Time

In [None]:
# One-off preprocessing, kept for reference: uncomment to (re)generate the
# processed remaining-time files from a raw CSV event log.
# NOTE(review): the filepath below is "sepsis.csv" - make sure it matches the
# log selected through dataset_name.
# data_processor = LogsDataProcessor(name=dataset_name, filepath="sepsis.csv",  
#                                     columns = ["case:concept:name", "concept:name", "time:timestamp"], #specify the columns name containing case_id, activity name and timestamp 
#                                     dir_path='datasets', datetime_format="%Y-%m-%d %H:%M:%S%z", pool = 4)
# data_processor.process_logs(task=constants.Task.REMAINING_TIME, sort_temporally= False)

# Hyperparameters
learning_rate = 0.001
batch_size = 12
epochs = 100
random_seed = 42

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialization and batch shuffling are repeatable across "Restart & Run All".
# (Some GPU kernels may still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(random_seed)

# Load data
data_loader = LogsDataLoader(name = dataset_name)
(train_df, test_df, x_word_dict, y_word_dict, max_case_length, 
    vocab_size, num_output) = data_loader.load_data(constants.Task.REMAINING_TIME)

# Prepare training examples for the remaining time prediction task. The
# returned time_scaler and y_scaler are kept so that test data can later be
# prepared with the same scalers and predictions can be inverse-transformed.
train_token_x, train_time_x, train_token_y, time_scaler, y_scaler = data_loader.prepare_data_remaining_time(train_df, 
                                                        x_word_dict, max_case_length)

# Create and train a transformer model
transformer_model = transformer.get_remaining_time_model(
    max_case_length=max_case_length, 
    vocab_size=vocab_size)

# Regression objective: log-cosh loss on the prepared target values.
transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.LogCosh())

# The model takes two inputs: the token sequences and the time features.
transformer_model.fit([train_token_x, train_time_x], train_token_y, 
    epochs=epochs, batch_size=batch_size)

## Evaluation with equally weighted prefixes (k)

In [None]:
# ################# check the k-values #########################################
# # Evaluate over all the prefixes (k) and save the results
# k, maes, mses, rmses = [],[],[],[]
# for i in range(max_case_length):
#     test_data_subset = test_df[test_df["k"]==i]
#     if len(test_data_subset) > 0:
#         test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_remaining_time(
#             test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False)   

#         y_pred = transformer_model.predict([test_token_x, test_time_x])
#         _test_y = y_scaler.inverse_transform(test_y)
#         _y_pred = y_scaler.inverse_transform(y_pred)

#         k.append(i)
#         maes.append(metrics.mean_absolute_error(_test_y, _y_pred))
#         mses.append(metrics.mean_squared_error(_test_y, _y_pred))
#         rmses.append(np.sqrt(metrics.mean_squared_error(_test_y, _y_pred)))

# k.append(i + 1)
# maes.append(np.mean(maes))
# mses.append(np.mean(mses))
# rmses.append(np.mean(rmses))  
# print('Average MAE across all prefixes:', np.mean(maes))
# print('Average MSE across all prefixes:', np.mean(mses))
# print('Average RMSE across all prefixes:', np.mean(rmses))


# # results_df = pd.DataFrame({"k":k, "mean_absolute_error":maes, 
# #     "mean_squared_error":mses, 
# #     "root_mean_squared_error":rmses})
# # results_df.to_csv(result_path+"_remaining_time.csv", index=False)

## Evaluation with weighted prefixes (k)

In [None]:
# Score the remaining-time model one prefix length (k) at a time, then append
# an instance-weighted summary entry to the collected columns and export them.
k_list, num_instances_list = [], []
maes, mses, rmses = [], [], []

for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    num_samples = len(test_data_subset)

    if num_samples <= 0:
        # This prefix length does not occur in the test set: record zero
        # placeholders (with zero instances) and move on.
        k_list.append(i)
        num_instances_list.append(0)
        maes.append(0)
        mses.append(0)
        rmses.append(0)
        continue

    # Prepare the subset with the scalers obtained from the training data.
    test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_remaining_time(
        test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False
    )
    y_pred = transformer_model.predict([test_token_x, test_time_x])

    # Map targets and predictions back through y_scaler before measuring errors.
    _test_y = y_scaler.inverse_transform(test_y)
    _y_pred = y_scaler.inverse_transform(y_pred)

    mae = metrics.mean_absolute_error(_test_y, _y_pred)
    mse = metrics.mean_squared_error(_test_y, _y_pred)
    rmse = np.sqrt(mse)

    k_list.append(i)
    num_instances_list.append(num_samples)
    maes.append(mae)
    mses.append(mse)
    rmses.append(rmse)

# Per-prefix weights and instance-weighted aggregates; everything falls back
# to 0 when no test instance was evaluated at all.
total_instances = sum(num_instances_list)
if total_instances > 0:
    weights = [n / total_instances for n in num_instances_list]
    weighted_mae = np.average(maes, weights=num_instances_list)
    weighted_mse = np.average(mses, weights=num_instances_list)
    weighted_rmse = np.average(rmses, weights=num_instances_list)
else:
    weights = [0] * len(num_instances_list)
    weighted_mae = weighted_mse = weighted_rmse = 0

# The summary entry is added to the column lists themselves, so after this
# point each list carries one extra trailing element.
k_list += ['Weighted Mean']
weights += ['']
maes += [weighted_mae]
mses += [weighted_mse]
rmses += [weighted_rmse]

# Assemble the final table from the extended columns.
df = pd.DataFrame(
    {"k": k_list, "weight": weights, "mae": maes, "mse": mses, "rmse": rmses}
)

# Persist the table, then echo it for a quick visual check.
df.to_csv(f"{dataset_name}_remaining_time.csv", index=False)
print(df)