In [8]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
from typing import List, Optional
import matplotlib.pyplot as plt
from typing import Dict, Tuple

from package.processtransformer import constants
from package.processtransformer.models import transformer
from package.processtransformer.data.loader import LogsDataLoader
from package.processtransformer.data.processor import LogsDataProcessor
from package.processtransformer.constants import Task


# Initialize data dir, if not exists.
# `exist_ok=True` makes the cell idempotent and avoids the check-then-create race
# of a separate `os.path.exists` test followed by `os.mkdir`.
os.makedirs("datasets", exist_ok=True)

# Task: Next Activity

In [9]:
class next_categorical_pipeline:
    """End-to-end pipeline for the next-categorical prediction task.

    Bundles the steps the notebook runs in sequence: preprocessing an event log,
    loading the stored train-test split, preparing model inputs, training the
    transformer model and evaluating it per prefix length.
    """

    def __init__(self, dataset_name: str, filepath: str, columns: List[str], additional_columns: Optional[List[str]],
                 datetime_format: str, task: Task, model_learning_rate: float, model_epochs: int, model_num_layers: int,
                 target_column: str = "concept:name"):
        """Store the pipeline configuration and derive the preprocessing / model ids.

        Args:
            dataset_name: Name of the dataset; also used as sub-directory of "datasets".
            filepath: Path of the raw event log handed to ``LogsDataProcessor``.
            columns: Case id, activity and timestamp column names, in that order
                (``columns[1]`` is treated as the activity column).
            additional_columns: Extra feature columns, or ``None`` / empty for none.
                The list is never modified by the pipeline.
            datetime_format: Timestamp format handed to ``LogsDataProcessor``.
            task: Task enum member; its ``.value`` is used in the generated ids.
            model_learning_rate: Learning rate for the Adam optimizer.
            model_epochs: Number of training epochs.
            model_num_layers: Number of transformer layers.
            target_column: Column to predict. Passing the activity column
                (``columns[1]``) is mapped to ``"concept:name"``.
        """
        self.dataset_name: str = dataset_name
        self.filepath: str = filepath
        self.columns: List[str] = columns
        self.additional_columns: Optional[List[str]] = additional_columns
        self.datetime_format: str = datetime_format
        self.task: Task = task
        self.model_learning_rate: float = model_learning_rate
        self.model_epochs: int = model_epochs
        self.model_num_layers: int = model_num_layers
        # The activity column is addressed as "concept:name" downstream
        # (presumably the processor renames it — verify against LogsDataProcessor).
        if target_column == columns[1]:
            self.target_column: str = "concept:name"
        else:
            self.target_column: str = target_column

        self._preprocessing_id: str = self.__build_preprocessing_id(dataset_name, task, additional_columns, target_column)

        # `additional_columns` is Optional, so fall back to an empty list for joining.
        extra_columns = self.additional_columns or []
        self._model_id: str = (
            f"{dataset_name}"
            f"##{'#'.join(self.columns)}"
            f"##{'#'.join(extra_columns)}"
            # task.value is already a string; joining it would interleave '#' between its characters
            f"##{self.task.value}"
            f"##{self.model_learning_rate}"
            f"##{self.model_epochs}"
            f"##{self.model_num_layers}")


    # sanitize the filename parts and build the preprocessing_id
    def __build_preprocessing_id(self, dataset_name, task, additional_columns, target_column):
        """Build the id used in the file names of the preprocessed train-test split.

        Characters that are not allowed in file names are replaced by ``_`` in the
        dataset name and in the additional column names. Sanitizing works on copies:
        the caller's ``additional_columns`` list (which is also what the data
        processor later receives as column names) is left untouched.

        NOTE(review): ``target_column`` is accepted but is not part of the id, and
        ``self.columns[1]`` is inserted unsanitized — confirm both are intended.

        Returns:
            The preprocessing id string.
        """
        # characters not allowed in file names: \ / : * ? " < > |
        invalid_chars_pattern = r'[\\/:*?"<>|]'

        safe_dataset_name = re.sub(invalid_chars_pattern, '_', dataset_name)
        safe_additional_columns = [
            re.sub(invalid_chars_pattern, '_', column) for column in (additional_columns or [])
        ]
        feature_part = '#'.join(safe_additional_columns) if safe_additional_columns else 'No_Additional_Columns'

        return f"{safe_dataset_name}##{task.value}##{self.columns[1]}#{feature_part}##"


    def __str__(self):
        """Return a multi-line, human-readable summary of the pipeline parameters."""
        return (
            f"dataset_name: '{self.dataset_name}'\n"
            f"filepath: '{self.filepath}'\n"
            f"columns: '{self.columns}'\n"
            f"additional_columns: '{self.additional_columns}'\n"
            f"datetime_format: '{self.datetime_format}'\n"
            f"task: '{self.task.value}'\n"
            f"Model learning rate: '{self.model_learning_rate}'\n"
            f"Model Epochs: '{self.model_epochs}'\n"
            f"Number of Transformer Layers in Model: '{self.model_num_layers}'\n"
            f"Target column: '{self.target_column}'\n")


    # preprocess the event log and save the train-test split as csv files
    def preprocess_log(self) -> List[int]:
        """Preprocess the event log and return the class counts of the training split.

        Runs ``LogsDataProcessor.process_logs`` for the configured task, then reads
        ``datasets/<dataset_name>/processed/<preprocessing_id>_train.csv`` and passes
        it to the processor's ``_compute_num_classes``.

        Returns:
            The value returned by ``_compute_num_classes`` for the training frame.
        """
        data_processor = LogsDataProcessor(
            name=self.dataset_name,
            filepath=self.filepath,
            preprocessing_id = self._preprocessing_id,
            columns=self.columns,
            additional_columns=self.additional_columns,  # Add all additional columns here, first all categorical, then all numerical features
            datetime_format=self.datetime_format,
            pool=4,
            target_column=self.target_column
        )
        # Preprocess the event log and make train-test split
        data_processor.process_logs(task=self.task, sort_temporally=False)

        # Compute the number of unique classes in each categorical column
        train_csv_path = os.path.join("datasets", self.dataset_name, "processed", f"{self._preprocessing_id}_train.csv")
        train_df = pd.read_csv(train_csv_path)
        num_classes_list = data_processor._compute_num_classes(train_df)

        return num_classes_list


    # load the preprocessed train-test split from the csv files
    def load_data(self) -> Tuple[LogsDataLoader, pd.DataFrame, pd.DataFrame, Dict[str, int], Dict[str, int], int, Dict[str, int], int, object, object]:
        """Load the preprocessed train-test split through a ``LogsDataLoader``.

        Returns:
            A 10-tuple ``(data_loader, train_df, test_df, x_word_dict, y_word_dict,
            max_case_length, vocab_size_dict, num_output, categorical_features,
            numerical_features)``; everything after ``data_loader`` is passed
            through unchanged from ``LogsDataLoader.load_data``.
        """
        data_loader = LogsDataLoader(name=self.dataset_name, preprocessing_id=self._preprocessing_id)
        train_df, test_df, x_word_dict, y_word_dict, max_case_length, vocab_size_dict, num_output, categorical_features, numerical_features = data_loader.load_data(self.task)
        return data_loader, train_df, test_df, x_word_dict, y_word_dict, max_case_length, vocab_size_dict, num_output, categorical_features, numerical_features


    def prepare_data(self, data_loader, train_df):
        """Turn the training frame into model inputs and targets.

        Returns:
            ``(train_token_dict_x, train_token_dict_y)`` as produced by
            ``data_loader.prepare_data(train_df)``; both are used as dicts keyed
            by column name elsewhere in this class.
        """
        print("Preparing data for task next_categorical...")
        train_token_dict_x, train_token_dict_y = data_loader.prepare_data(train_df)
        return train_token_dict_x, train_token_dict_y


    # Prepare data and train the model
    def train(self,
            categorical_features,
            numerical_features,
            train_token_dict_x,
            train_token_dict_y,
            max_case_length: int,
            vocab_size_dict: Dict[str, int],
            num_output: int,
            num_classes_list: List[int],
            batch_size: int = 12
            ) -> tf.keras.Model:
        """Build, compile and fit the next-categorical transformer model.

        Args:
            categorical_features: Forwarded to ``transformer.get_next_categorical_model``.
            numerical_features: Forwarded to ``transformer.get_next_categorical_model``.
            train_token_dict_x: Model inputs; iterated as a dict for the debug output.
            train_token_dict_y: Targets, indexed by ``self.target_column``.
            max_case_length: Forwarded to the model factory.
            vocab_size_dict: Forwarded to the model factory; iterated as a dict.
            num_output: Output dimension of the model.
            num_classes_list: Class counts as returned by ``preprocess_log``.
            batch_size: Batch size used for ``model.fit`` (defaults to 12).

        Returns:
            The fitted model.
        """
        # Define and compile the model
        model = transformer.get_next_categorical_model(
            max_case_length=max_case_length,
            vocab_size_dict=vocab_size_dict,
            output_dim=num_output,
            categorical_features=categorical_features,
            numerical_features=numerical_features,
            num_classes_list=num_classes_list,  # Pass the computed number of classes list
            num_layers=self.model_num_layers
        )
        model.compile(
            optimizer=tf.keras.optimizers.Adam(self.model_learning_rate),
            # NOTE(review): from_logits=False assumes the model ends in a softmax — confirm in transformer.py
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
        )

        # TODO: debugging
        print("train_token_dict_x")
        for key, value in train_token_dict_x.items():
            print(f"key: {key}")
            print(value)
        print()

        print("vocab_size_dict")
        for key, value in vocab_size_dict.items():
            print(f"key: {key}")
            print(value)

        # Train the model
        history = model.fit(train_token_dict_x, train_token_dict_y[self.target_column], epochs=self.model_epochs, batch_size=batch_size)

        # Plot training loss
        self._plot_training_loss(history)
        return model


    # helper function for plotting the training loss
    def _plot_training_loss(self, history):
        """Plot the training loss (and validation loss, if recorded) per epoch."""
        plt.figure(figsize=(10, 6))
        plt.plot(history.history['loss'], label='Training Loss')
        if 'val_loss' in history.history:
            plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training Loss Over Epochs')
        plt.legend()
        plt.grid(True)
        plt.show()

    def evaluate(self, model, data_loader, train_df, test_df, max_case_length, x_word_dict, y_word_dict):
        """Evaluate the model per prefix length and print a summary table.

        For every prefix length ``k`` in ``range(max_case_length)`` that occurs in
        ``test_df["k"]``, accuracy and weighted precision / recall / f-score are
        computed. A final "Weighted Mean" row averages the metrics, weighted by
        each prefix length's share of the test samples.

        NOTE(review): training feeds the dict inputs from ``data_loader.prepare_data``,
        whereas this method uses ``prepare_data_next_categorical`` and list inputs —
        confirm the loader still offers that method and the model accepts list inputs.

        Returns:
            The results ``DataFrame`` (columns k, weight, accuracy, fscore,
            precision, recall). It has no rows when no prefix length had test samples.
        """
        print("Evaluating...")

        # Prepare lists to store evaluation metrics
        k, accuracies, fscores, precisions, recalls, weights = [], [], [], [], [], []

        # Calculate total number of samples
        total_samples = len(test_df)

        # The combined frame does not change between prefixes, so build it once
        full_df = pd.concat([train_df, test_df])

        # Iterate over all prefixes (k)
        for i in range(max_case_length):
            print( "Prefix length: " + str(i) )
            test_data_subset = test_df[test_df["k"] == i]
            if len(test_data_subset) == 0:
                continue

            # Calculate weight for this prefix
            weight = len(test_data_subset) / total_samples

            # Prepare the test data
            test_token_x, test_token_y, test_additional_features, _, _ = data_loader.prepare_data_next_categorical(
                test_data_subset, x_word_dict, y_word_dict, max_case_length, full_df=full_df
            )

            # Make predictions
            if test_additional_features.shape[1] != 0:
                y_pred = np.argmax(model.predict([test_token_x, test_additional_features]), axis=1)
            else:
                y_pred = np.argmax(model.predict([test_token_x]), axis=1)

            # Compute metrics
            accuracy = metrics.accuracy_score(test_token_y, y_pred)
            precision, recall, fscore, _ = metrics.precision_recall_fscore_support(test_token_y, y_pred, average="weighted", zero_division=0)

            # Store metrics and weight
            k.append(i)
            accuracies.append(accuracy)
            fscores.append(fscore)
            precisions.append(precision)
            recalls.append(recall)
            weights.append(weight)

        result_columns = ['k', 'weight', 'accuracy', 'fscore', 'precision', 'recall']

        # np.average raises when the weights are empty, so handle "nothing evaluated" explicitly
        if not k:
            print("No test samples found for any prefix length; nothing to evaluate.")
            results_df = pd.DataFrame(columns=result_columns)
            print(results_df)
            return results_df

        # Compute weighted mean metrics over all k
        weighted_accuracy = np.average(accuracies, weights=weights)
        weighted_fscore = np.average(fscores, weights=weights)
        weighted_precision = np.average(precisions, weights=weights)
        weighted_recall = np.average(recalls, weights=weights)

        # Append weighted mean metrics to the lists
        weights.append("")
        k.append("Weighted Mean")
        accuracies.append(weighted_accuracy)
        fscores.append(weighted_fscore)
        precisions.append(weighted_precision)
        recalls.append(weighted_recall)

        # Create a DataFrame to display the results
        results_df = pd.DataFrame({
            'k': k,
            'weight': weights,
            'accuracy': accuracies,
            'fscore': fscores,
            'precision': precisions,
            'recall': recalls
        })

        # Display the results
        print(results_df)
        return results_df

In [10]:
### Helper Functions ###

# helper function: do only preprocessing on data
def preprocess(additional_columns, target_column):
    """Run only the preprocessing step for the helpdesk log.

    Builds a `next_categorical_pipeline` with fixed helpdesk settings and calls
    its `preprocess_log`; nothing is returned.

    Args:
        additional_columns: Extra feature columns forwarded to the pipeline.
        target_column: Column to predict, e.g. "concept:name" or "Resource".
    """
    pipeline_settings = dict(
        dataset_name="helpdesk",
        filepath="helpdesk.csv",
        columns=["Case ID", "Activity", "Complete Timestamp"],
        additional_columns=additional_columns,
        datetime_format="%Y-%m-%d %H:%M:%S.%f",
        task=constants.Task.NEXT_CATEGORICAL,
        # model settings are required by the constructor but unused by preprocessing
        model_learning_rate=0.001,
        model_epochs=1,
        model_num_layers=1,
        target_column=target_column,
    )
    next_categorical_pipeline(**pipeline_settings).preprocess_log()


# helper function
def run(additional_columns, target_column):
    """Run the full helpdesk pipeline: preprocess, load, prepare, train, evaluate.

    Progress and results are printed; nothing is returned.

    Args:
        additional_columns: Extra feature columns forwarded to the pipeline.
        target_column: Column to predict, e.g. "concept:name" or "Resource".
    """
    print("target_column: " + target_column)

    # Fixed helpdesk configuration; only the feature / target columns vary per call.
    settings = dict(
        dataset_name="helpdesk",
        filepath="helpdesk.csv",
        columns=["Case ID", "Activity", "Complete Timestamp"],
        additional_columns=additional_columns,
        datetime_format="%Y-%m-%d %H:%M:%S.%f",
        task=constants.Task.NEXT_CATEGORICAL,
        model_learning_rate=0.001,
        model_epochs=10,
        model_num_layers=1,
        target_column=target_column,
    )
    pipe = next_categorical_pipeline(**settings)

    # Show the effective parameters.
    print(pipe)

    # Preprocess, then load the stored train-test split.
    class_counts = pipe.preprocess_log()
    (loader, train_frame, test_frame, x_vocab, y_vocab, case_len,
     vocab_sizes, n_outputs, cat_features, num_features) = pipe.load_data()

    # Build model inputs / targets from the training split.
    inputs_x, targets_y = pipe.prepare_data(loader, train_frame)

    # Train the model.
    fitted_model = pipe.train(
        categorical_features=cat_features,
        numerical_features=num_features,
        train_token_dict_x=inputs_x,
        train_token_dict_y=targets_y,
        max_case_length=case_len,
        vocab_size_dict=vocab_sizes,
        num_output=n_outputs,
        num_classes_list=class_counts,
    )

    # Evaluate per prefix length, then print a visual separator between runs.
    pipe.evaluate(fitted_model, loader, train_frame, test_frame, case_len, x_vocab, y_vocab)
    for line in ("", "======================================", "======================================"):
        print(line)

In [None]:
# Run preprocessing only, once per target column ("Activity", then "Resource").
# NOTE(review): the pipeline builds its preprocessing id without the target column, so both
# calls resolve to the same id — confirm the second call is not served from the first one's files.
preprocess(additional_columns=["Resource"], target_column="Activity")
preprocess(additional_columns=["Resource"], target_column="Resource")

In [None]:
# Scratch cell: parse a space-separated string of integers into a list of ints.
number_string = "1 2 3 4 5 6 7 8 9 10"
number_list_str = number_string.split(' ')
number_list = list(map(int, number_list_str))
print(number_list)

In [11]:
# Full pipeline run (preprocess, load, prepare, train, evaluate) with "Activity" as target column.
run(additional_columns=["Resource"], target_column="Activity")

target_column: Activity
dataset_name: 'helpdesk'
filepath: 'helpdesk.csv'
columns: '['Case ID', 'Activity', 'Complete Timestamp']'
additional_columns: '['Resource']'
datetime_format: '%Y-%m-%d %H:%M:%S.%f'
task: 'next_categorical'
Model learning rate: '0.001'
Model Epochs: '10'
Number of Transformer Layers in Model: '1'
Target column: 'concept:name'

Preprocessed train-test split for task next_categorical found. Preprocessing skipped.
Loading data from preprocessed train-test split...
Preparing data for task next_categorical...
y:
[3. 3. 4. ... 3. 4. 5.]
<class 'numpy.ndarray'>
y:
[ 2.  3.  2. ... 10. 10.  6.]
<class 'numpy.ndarray'>
Creating model for task next_categorical...
train_token_dict_x
key: concept:name
[[0. 0. 0. ... 0. 0. 2.]
 [0. 0. 0. ... 0. 2. 3.]
 [0. 0. 0. ... 2. 3. 3.]
 ...
 [0. 0. 0. ... 0. 0. 2.]
 [0. 0. 0. ... 0. 2. 3.]
 [0. 0. 0. ... 2. 3. 4.]]
key: Resource
[[ 0.  0.  0. ...  0.  0.  2.]
 [ 0.  0.  0. ...  0.  2.  2.]
 [ 0.  0.  0. ...  2.  2.  3.]
 ...
 [ 0.  0.

InvalidArgumentError: Graph execution error:

Detected at node 'next_categorical_transformer/token_and_position_embedding_4/embedding_8/embedding_lookup' defined at (most recent call last):
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
      app.start()
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\ipykernel\kernelapp.py", line 701, in start
      self.io_loop.start()
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\tornado\platform\asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\asyncio\windows_events.py", line 321, in run_forever
      super().run_forever()
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\ipykernel\kernelbase.py", line 534, in dispatch_queue
      await self.process_one()
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\ipykernel\kernelbase.py", line 523, in process_one
      await dispatch(*args)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\ipykernel\kernelbase.py", line 429, in dispatch_shell
      await result
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\ipykernel\kernelbase.py", line 767, in execute_request
      reply_content = await reply_content
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\ipykernel\ipkernel.py", line 429, in do_execute
      res = shell.run_cell(
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\IPython\core\interactiveshell.py", line 3024, in run_cell
      result = self._run_cell(
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\IPython\core\interactiveshell.py", line 3079, in _run_cell
      result = runner(coro)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\IPython\core\interactiveshell.py", line 3284, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\IPython\core\interactiveshell.py", line 3466, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\vince\AppData\Local\Temp\ipykernel_8696\3350013915.py", line 1, in <module>
      run(additional_columns=["Resource"], target_column="Activity")
    File "C:\Users\vince\AppData\Local\Temp\ipykernel_8696\848893064.py", line 51, in run
      model = pipeline.train(
    File "C:\Users\vince\AppData\Local\Temp\ipykernel_8696\956288903.py", line 148, in train
      history = model.fit(train_token_dict_x, train_token_dict_y[self.target_column], epochs=self.model_epochs, batch_size=batch_size)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\vince\OneDrive\Dokumente\Studium-Vincents-Surface\Master\Unterlagen\Master Thesis\Repository\MasterThesis\MultiTaskTransformer\package\processtransformer\models\transformer.py", line 72, in call
      x = self.token_emb(x)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\vince\anaconda3\envs\MasterThesis\lib\site-packages\keras\layers\core\embedding.py", line 208, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'next_categorical_transformer/token_and_position_embedding_4/embedding_8/embedding_lookup'
indices[4,12] = 16 is not in [0, 16)
	 [[{{node next_categorical_transformer/token_and_position_embedding_4/embedding_8/embedding_lookup}}]] [Op:__inference_train_function_16543]

## Preprocessing and Loading

### Helpdesk

In [None]:
# Process and load data for the next activity task
# NOTE(review): `process_and_load_data` is not defined in the visible part of this notebook —
# confirm it still exists before running this cell on a fresh kernel.
dataset_name = "helpdesk"
data_loader, train_df, test_df, x_word_dict, y_word_dict, max_case_length, vocab_size, num_output, num_classes_list = process_and_load_data(
    dataset_name = dataset_name,
    filepath = "helpdesk.csv",
    columns = ["Case ID", "Activity", "Complete Timestamp"],
    additional_columns = ["Resource"],
    datetime_format = "%Y-%m-%d %H:%M:%S.%f",
    task = constants.Task.NEXT_ACTIVITY)

### Sepsis

In [None]:
# Dataset processing: preprocess the sepsis event log for the next activity task.
# NOTE(review): unlike the LogsDataProcessor call in `next_categorical_pipeline.preprocess_log`,
# no `preprocessing_id` / `target_column` is passed here — confirm this matches the current constructor.
dataset_name = "sepsis"
data_processor = LogsDataProcessor(
    name=dataset_name,
    filepath="sepsis.xes",
    columns=["case:concept:name", "concept:name", "time:timestamp"],  # specify the columns name containing case_id, activity name and timestamp
    additional_columns=["org:group"],
    datetime_format="%Y-%m-%d %H:%M:%S%z",
    pool=4
)
data_processor.process_logs(task=constants.Task.NEXT_ACTIVITY, sort_temporally=False)

# Garbage collection: drop the reference to the processor, which is no longer used in this cell
del data_processor

## Data Preparation for Training

In [None]:
# Prepare training examples for next activity prediction task
train_token_x, train_token_y, train_additional_features, num_categorical_features, num_numerical_features = data_loader.prepare_data_next_activity(
    train_df, x_word_dict, y_word_dict, max_case_length, full_df=pd.concat([train_df, test_df])
)

# NOTE: `data_loader` is deliberately NOT deleted here — the evaluation cell further down
# still calls `data_loader.prepare_data_next_activity` on the test prefixes, so deleting it
# would make a top-to-bottom run fail with a NameError.

## Model Training

In [None]:
# Hyper-parameters for this training run
learning_rate = 0.001
batch_size = 12
epochs = 3

# Build the next-activity transformer
model = transformer.get_next_activity_model(
    max_case_length=max_case_length,
    vocab_size=vocab_size,
    output_dim=num_output,
    num_categorical_features=num_categorical_features,
    num_numerical_features=num_numerical_features,
    num_classes_list=num_classes_list,  # class counts computed during preprocessing
    num_layers=1,
)

# Compile with Adam and sparse categorical cross-entropy.
# NOTE(review): from_logits=True assumes the model outputs raw logits — confirm in transformer.py.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# Fit the model; the additional-feature input is only passed when it has at least one column.
model.fit(
    [train_token_x] if train_additional_features.shape[1] == 0 else [train_token_x, train_additional_features],
    train_token_y,
    epochs=epochs,
    batch_size=batch_size,
)

## Evaluation

In [None]:
# Per-prefix evaluation metrics. These lists hold ONLY per-prefix values; the weighted-mean
# summary row is added to the results table below without touching them, so later cells can
# safely aggregate over the lists.
k, accuracies, fscores, precisions, recalls, weights = [], [], [], [], [], []

# Calculate total number of samples
total_samples = len(test_df)

# The combined frame does not change between prefixes, so build it once
full_df = pd.concat([train_df, test_df])

# Iterate over all prefixes (k)
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    if len(test_data_subset) == 0:
        continue

    # Share of the test samples that have this prefix length
    weight = len(test_data_subset) / total_samples

    # Prepare the test data
    test_token_x, test_token_y, test_additional_features, _, _ = data_loader.prepare_data_next_activity(
        test_data_subset, x_word_dict, y_word_dict, max_case_length, full_df=full_df
    )

    # Make predictions
    if test_additional_features.shape[1] != 0:
        y_pred = np.argmax(model.predict([test_token_x, test_additional_features]), axis=1)
    else:
        y_pred = np.argmax(model.predict([test_token_x]), axis=1)

    # Compute metrics (zero_division=0 silences the warning for classes that are never predicted)
    accuracy = metrics.accuracy_score(test_token_y, y_pred)
    precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
        test_token_y, y_pred, average="weighted", zero_division=0)

    # Store metrics and weight
    k.append(i)
    accuracies.append(accuracy)
    fscores.append(fscore)
    precisions.append(precision)
    recalls.append(recall)
    weights.append(weight)

if k:
    # Compute weighted mean metrics over all k
    weighted_accuracy = np.average(accuracies, weights=weights)
    weighted_fscore = np.average(fscores, weights=weights)
    weighted_precision = np.average(precisions, weights=weights)
    weighted_recall = np.average(recalls, weights=weights)

    # Results table: one row per prefix length plus a clearly labelled summary row
    results_df = pd.DataFrame({
        'k': k + ["Weighted Mean"],
        'weight': weights + [""],
        'accuracy': accuracies + [weighted_accuracy],
        'fscore': fscores + [weighted_fscore],
        'precision': precisions + [weighted_precision],
        'recall': recalls + [weighted_recall]
    })
else:
    # np.average raises on empty weights; report an empty table instead
    print("No test samples found for any prefix length; nothing to evaluate.")
    results_df = pd.DataFrame(columns=['k', 'weight', 'accuracy', 'fscore', 'precision', 'recall'])

# Display the results
print(results_df)

In [None]:
# Unweighted mean of every entry currently stored in each metric list.
# NOTE(review): if the evaluation cell also stored a summary value in these lists,
# that value is part of the mean reported here.
for label, values in (
    ('Average accuracy across all prefixes:', accuracies),
    ('Average f-score across all prefixes:', fscores),
    ('Average precision across all prefixes:', precisions),
    ('Average recall across all prefixes:', recalls),
):
    print(label, np.mean(values))

# Task: Next Time  -- Ignored for now

## Preprocessing and Loading

### Helpdesk

In [None]:
# Process and load data for the next time task
# NOTE(review): `process_and_load_data` is not defined in the visible part of this notebook —
# confirm it still exists before running this cell on a fresh kernel.
dataset_name = "helpdesk"
data_loader, train_df, test_df, x_word_dict, y_word_dict, max_case_length, vocab_size, num_output, num_classes_list = process_and_load_data(
    dataset_name = dataset_name,
    filepath = "helpdesk.csv",
    columns = ["Case ID", "Activity", "Complete Timestamp"],
    additional_columns = ["Resource", "product"],
    datetime_format = "%Y-%m-%d %H:%M:%S.%f",
    task = constants.Task.NEXT_TIME)

### Sepsis

## Data Preparation for Training

In [None]:
# Prepare training examples for next time prediction task.
# Unpacks the eight values returned by `data_loader.prepare_data_next_time` for the training split.
train_token_x, train_time_x, train_token_y, train_additional_features, time_scaler, y_scaler, num_categorical_features, num_numerical_features = data_loader.prepare_data_next_time(
    train_df, x_word_dict, max_case_length, shuffle=True)

# Garbage collection: drop the loader reference.
# NOTE(review): any later cell that still needs `data_loader` will fail after this — confirm none does.
del data_loader

# Task: Remaining Time -- Ignored for now

## Preprocessing and Loading

### Helpdesk

### Sepsis

In [None]:
# Process and load data for the next time task
# NOTE(review): this cell sits under the "Remaining Time" section but passes Task.NEXT_TIME —
# confirm which task is intended. `process_and_load_data` is also not defined in the visible
# part of this notebook.
dataset_name = "sepsis"
data_loader, train_df, test_df, x_word_dict, y_word_dict, max_case_length, vocab_size, num_output, num_classes_list = process_and_load_data(
    dataset_name, "sepsis.xes", ["case:concept:name", "concept:name", "time:timestamp"], ["org:group"], "%Y-%m-%d %H:%M:%S%z", constants.Task.NEXT_TIME)

## Data Preparation for Training