In [1]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
from typing import List, Optional
import matplotlib.pyplot as plt
from typing import Dict, Tuple

from package.processtransformer import constants
from package.processtransformer.models import transformer
from package.processtransformer.data.loader import LogsDataLoader
from package.processtransformer.data.processor import LogsDataProcessor
from package.processtransformer.constants import Task, Feature_Type


# Make sure the local data directory exists. `exist_ok=True` makes the call
# idempotent and avoids the check-then-create race of exists() + mkdir().
os.makedirs("datasets", exist_ok=True)

# Task: Next Activity

In [2]:
class pipeline:
    """Convenience wrapper that runs one dataset / one task through the processtransformer package.

    The methods mirror the stages of an experiment: `preprocess_log` writes the
    processed train/test split, `load_data` reads it back, `prepare_data`
    tokenises a frame, `train` builds and fits the model and `evaluate` prints
    per-prefix-length metrics.
    """

    def __init__(self, dataset_name: str, filepath: str, columns: List[str],
                 additional_columns: "Optional[Dict[Feature_Type, List[str]]]",
                 datetime_format: str, task: "Task", model_learning_rate: float,
                 model_epochs: int, model_num_layers: int,
                 train_columns: List[str], target_columns: List[str]):
        """Store the experiment configuration.

        Args:
            dataset_name: Name handed to the data processor and data loader.
            filepath: Path of the raw event log handed to the data processor.
            columns: Core column names; `columns[1]` is treated as the activity
                column and is renamed to "concept_name" wherever it appears in
                `train_columns` / `target_columns`.
            additional_columns: Extra feature columns grouped by feature type, or None.
            datetime_format: Timestamp format handed to the data processor.
            task: Prediction task handed to the processor and loader.
            model_learning_rate: Learning rate for the Adam optimizer.
            model_epochs: Number of training epochs.
            model_num_layers: Number of transformer layers in the model.
            train_columns: Columns used as model input.
            target_columns: Columns to predict.
        """
        self.dataset_name: str = dataset_name
        self.filepath: str = filepath
        self.columns: List[str] = columns
        self.additional_columns = additional_columns
        self.datetime_format: str = datetime_format
        self.task = task
        self.model_learning_rate: float = model_learning_rate
        self.model_epochs: int = model_epochs
        self.model_num_layers: int = model_num_layers

        # Work on copies: the original implementation renamed entries inside the
        # caller's lists, silently changing the arguments that were passed in.
        self.target_columns: List[str] = self._rename_activity_column(target_columns, columns)
        self.train_columns: List[str] = self._rename_activity_column(train_columns, columns)

        # self._model_id: str = (
        #     f"{dataset_name}"
        #     f"##{'#'.join(self.columns)}"
        #     f"##{'#'.join(self.additional_columns)}"
        #     f"##{'#'.join(self.task.value)}"
        #     f"##{self.model_learning_rate}"
        #     f"##{self.model_epochs}"
        #     f"##{self.model_num_layers}")

    @staticmethod
    def _rename_activity_column(names: List[str], columns: List[str]) -> List[str]:
        """Return a copy of `names` with the first entry equal to `columns[1]` replaced by "concept_name"."""
        renamed = list(names)
        for idx, name in enumerate(renamed):
            if name == columns[1]:
                renamed[idx] = "concept_name"
                break
        return renamed

    @property
    def target_column(self) -> str:
        """The single target column used by `train` and `evaluate`.

        Both methods index the target token dict with `self.target_column`,
        which was never assigned; it is defined here as the first entry of
        `target_columns`.

        Raises:
            ValueError: If `target_columns` is empty.
        """
        if not self.target_columns:
            raise ValueError("target_columns must contain at least one column name")
        return self.target_columns[0]

    def __str__(self):
        """Return a multi-line, human-readable dump of the configuration."""
        return (
            f"dataset_name: '{self.dataset_name}'\n"
            f"filepath: '{self.filepath}'\n"
            f"columns: '{self.columns}'\n"
            f"additional_columns: '{self.additional_columns}'\n"
            f"datetime_format: '{self.datetime_format}'\n"
            f"task: '{self.task.value}'\n"
            f"Model learning rate: '{self.model_learning_rate}'\n"
            f"Model Epochs: '{self.model_epochs}'\n"
            f"Number of Transformer Layers in Model: '{self.model_num_layers}'\n"
            f"Target columns: '{self.target_columns}'\n"
            f"Train columns: '{self.train_columns}'\n")

    # preprocess the event log and save the train-test split as csv files
    def preprocess_log(self) -> Optional[List[int]]:
        """Run the data processor for the configured task.

        Returns:
            Currently always None. The per-column class counts are not computed
            yet (see the TODO below), so the annotation is Optional.
        """
        data_processor = LogsDataProcessor(
            name=self.dataset_name,
            filepath=self.filepath,
            columns=self.columns,
            additional_columns=self.additional_columns,  # Add all additional columns here, first all categorical, then all numerical features
            datetime_format=self.datetime_format,
            pool=4
        )
        # Preprocess the event log and make train-test split
        data_processor.process_logs(task=self.task, sort_temporally=False)

        # TODO: Compute the number of unique classes in each categorical column
        # train_df = pd.read_csv(os.path.join("datasets", self.dataset_name, "processed", f"{self._preprocessing_id}_train.csv"))
        # num_classes_list = data_processor._compute_num_classes(train_df)

        # return num_classes_list
        return None

    # load the preprocessed train-test split from the csv files
    def load_data(self) -> Tuple:
        """Load the preprocessed split through a `LogsDataLoader`.

        Returns:
            A 9-tuple, in this order: data_loader, train_df, test_df, word_dict,
            max_case_length, vocab_size_dict, num_output, categorical_features,
            numerical_features. All but `data_loader` and `num_output` come
            straight from `LogsDataLoader.load_data`.
        """
        data_loader = LogsDataLoader(name=self.dataset_name, train_columns=self.train_columns, target_columns=self.target_columns)
        train_df, test_df, word_dict, max_case_length, vocab_size_dict, categorical_features, numerical_features = data_loader.load_data(self.task)
        # num_output = len(y_word_dict[self.target_column])
        # NOTE(review): num_output is still a placeholder; `train` forwards it as
        # the model's output_dim, so it must be derived from the target
        # vocabulary before training can give meaningful results.
        num_output = 0
        return data_loader, train_df, test_df, word_dict, max_case_length, vocab_size_dict, num_output, categorical_features, numerical_features

    def prepare_data(self, data_loader, df):
        """Tokenise `df` with `data_loader.prepare_data` and return `(token_dict_x, token_dict_y)`."""
        print("Preparing data for task next_categorical...")
        # Prepare training examples for next categorical prediction task
        # train_token_x, train_token_y, train_additional_features, num_categorical_features, num_numerical_features = data_loader.prepare_data_next_categorical(
        #     train_df, x_word_dict, y_word_dict, max_case_length, full_df=pd.concat([train_df, test_df])
        # )
        token_dict_x, token_dict_y = data_loader.prepare_data(df)
        return token_dict_x, token_dict_y

    # Prepare data and train the model
    def train(self,
              categorical_features,
              numerical_features,
              train_token_dict_x,
              train_token_dict_y,
              max_case_length: int,
              vocab_size_dict: Dict[str, int],
              num_output: int,
              num_classes_list: Optional[List[int]],
              batch_size: int = 12
              ) -> "tf.keras.Model":
        """Build, compile and fit the next-categorical model, then plot its training loss.

        Args:
            categorical_features: Forwarded to `transformer.get_next_categorical_model`.
            numerical_features: Forwarded to `transformer.get_next_categorical_model`.
            train_token_dict_x: Model inputs as returned by `prepare_data`.
            train_token_dict_y: Targets as returned by `prepare_data`; the entry
                for `self.target_column` is used as the label array.
            max_case_length: Forwarded to the model factory.
            vocab_size_dict: Forwarded to the model factory.
            num_output: Output dimension of the model.
            num_classes_list: Forwarded to the model factory.
            batch_size: Mini-batch size for `model.fit` (defaults to the
                previously hard-coded 12).

        Returns:
            The fitted Keras model.
        """
        # Define and compile the model
        model = transformer.get_next_categorical_model(
            max_case_length=max_case_length,
            vocab_size_dict=vocab_size_dict,
            output_dim=num_output,
            categorical_features=categorical_features,
            numerical_features=numerical_features,
            num_classes_list=num_classes_list,  # Pass the computed number of classes list
            num_layers=self.model_num_layers
        )
        model.compile(
            optimizer=tf.keras.optimizers.Adam(self.model_learning_rate),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
        )

        # Train the model
        history = model.fit(train_token_dict_x, train_token_dict_y[self.target_column], epochs=self.model_epochs, batch_size=batch_size)

        # Plot training loss
        self._plot_training_loss(history)
        return model

    # helper function for plotting the training loss
    def _plot_training_loss(self, history):
        """Plot the training loss (and validation loss, if recorded) per epoch."""
        plt.figure(figsize=(10, 6))
        plt.plot(history.history['loss'], label='Training Loss')
        if 'val_loss' in history.history:
            plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training Loss Over Epochs')
        plt.legend()
        plt.grid(True)
        plt.show()

    def evaluate(self, model, data_loader, train_df, test_df, max_case_length, x_word_dict, y_word_dict):
        """Print accuracy / f-score / precision / recall per prefix length plus their weighted mean.

        The test frame is split by its "concept_name_prefix-length" column; each
        non-empty subset is tokenised with `data_loader.prepare_data`, predicted
        with `model`, and scored against `self.target_column`. Each prefix is
        weighted by its share of `test_df`.

        `train_df`, `x_word_dict` and `y_word_dict` are accepted for interface
        compatibility but are not read by the current implementation.
        """
        print("Evaluating...")

        # Prepare lists to store evaluation metrics
        k, accuracies, fscores, precisions, recalls, weights = [], [], [], [], [], []

        # Calculate total number of samples
        total_samples = len(test_df)

        # Iterate over all prefixes (k)
        for i in range(1, max_case_length+1):
            print( "Prefix length: " + str(i) )
            test_data_subset = test_df[test_df["concept_name_prefix-length"] == i]
            if len(test_data_subset) > 0:
                # Calculate weight for this prefix
                weight = len(test_data_subset) / total_samples

                # Prepare the test data
                # test_token_x, test_token_y, test_additional_features, _, _ = data_loader.prepare_data_next_categorical(
                #     test_data_subset, x_word_dict, y_word_dict, max_case_length, full_df=pd.concat([train_df, test_df])
                # )
                x_token_dict, y_token_dict = data_loader.prepare_data(test_data_subset)

                # Make predictions
                y_pred = np.argmax(model.predict(x_token_dict), axis=1)
                # y_pred = np.argmax(model.predict([test_token_x]), axis=1)

                # Compute metrics
                y_true = y_token_dict[self.target_column]
                accuracy = metrics.accuracy_score(y_true, y_pred)
                precision, recall, fscore, _ = metrics.precision_recall_fscore_support(y_true, y_pred, average="weighted", zero_division=0)

                # Store metrics and weight
                k.append(i)
                accuracies.append(accuracy)
                fscores.append(fscore)
                precisions.append(precision)
                recalls.append(recall)
                weights.append(weight)

        # np.average raises ZeroDivisionError when the weights are empty, so
        # bail out early if no prefix length had any test rows.
        if not weights:
            print("No test samples found for any prefix length; nothing to evaluate.")
            return

        # Compute weighted mean metrics over all k
        weighted_accuracy = np.average(accuracies, weights=weights)
        weighted_fscore = np.average(fscores, weights=weights)
        weighted_precision = np.average(precisions, weights=weights)
        weighted_recall = np.average(recalls, weights=weights)

        # Append weighted mean metrics to the lists
        weights.append("")
        k.append("Weighted Mean")
        accuracies.append(weighted_accuracy)
        fscores.append(weighted_fscore)
        precisions.append(weighted_precision)
        recalls.append(weighted_recall)

        # Create a DataFrame to display the results
        results_df = pd.DataFrame({
            'k': k,
            'weight': weights,
            'accuracy': accuracies,
            'fscore': fscores,
            'precision': precisions,
            'recall': recalls
        })

        # Display the results
        print(results_df)

In [3]:
### Helper Functions ###

# helper function: do only preprocessing on data
def preprocess(additional_columns, train_columns, target_columns):
    """Configure a helpdesk pipeline and run only its preprocessing step.

    The three arguments are forwarded unchanged to `pipeline`; every other
    setting (dataset, file, core columns, task, model hyper-parameters) is
    fixed below.
    """
    settings = dict(
        dataset_name="helpdesk",
        filepath="helpdesk.csv",
        columns=["Case ID", "Activity", "Complete Timestamp"],
        additional_columns=additional_columns,
        datetime_format="%Y-%m-%d %H:%M:%S.%f",
        task=constants.Task.NEXT_CATEGORICAL,
        model_learning_rate=0.001,
        model_epochs=1,
        model_num_layers=1,
        target_columns=target_columns,  # Examples: "concept_name", "Resource"
        train_columns=train_columns,
    )
    # Only the preprocessing stage is executed; nothing is returned.
    pipeline(**settings).preprocess_log()


# helper function
def run(additional_columns, train_columns, target_columns):
    """Run the full helpdesk experiment: preprocess, load, train and evaluate.

    The three arguments are forwarded unchanged to `pipeline`; every other
    setting is fixed below. Nothing is returned; metrics are printed by
    `pipeline.evaluate`.
    """
    # initialize pipeline with parameters
    pipe = pipeline(
        dataset_name = "helpdesk",
        filepath = "helpdesk.csv",
        columns = ["Case ID", "Activity", "Complete Timestamp"],
        additional_columns = additional_columns,
        datetime_format = "%Y-%m-%d %H:%M:%S.%f",
        task = constants.Task.NEXT_CATEGORICAL,
        model_learning_rate = 0.001,
        model_epochs = 10,
        model_num_layers = 1,
        target_columns=target_columns,
        train_columns=train_columns)  # Examples: "concept:name", "Resource"

    # print parameters (the configured instance, not the class object)
    print(pipe)

    # preprocess data
    # preprocess_log() does not compute the class counts yet (it returns None);
    # the value is captured so train() picks it up once that TODO is done.
    # NOTE(review): confirm the model factory tolerates num_classes_list=None.
    num_classes_list = pipe.preprocess_log()

    # load data -- load_data() returns nine values, all of which are needed below
    (data_loader, train_df, test_df, word_dicts, max_case_length,
     vocab_size_dict, num_output, categorical_features, numerical_features) = pipe.load_data()

    # prepare data
    train_token_dict_x, train_token_dict_y = pipe.prepare_data(data_loader, train_df)

    # train the model
    model = pipe.train(
                categorical_features = categorical_features,
                numerical_features = numerical_features,
                train_token_dict_x = train_token_dict_x,
                train_token_dict_y = train_token_dict_y,
                max_case_length = max_case_length,
                vocab_size_dict = vocab_size_dict,
                num_output = num_output,
                num_classes_list = num_classes_list)

    # evaluate the model
    # evaluate() still takes separate x/y word dicts but does not read them, so
    # the combined word dicts from load_data() are passed for both.
    pipe.evaluate(model, data_loader, train_df, test_df, max_case_length, word_dicts, word_dicts)
    print("")
    print("======================================")
    print("======================================")
    
    
# function for testing out code
def test(additional_columns, train_columns, target_columns):
    """Scratch helper: build a helpdesk pipeline, print it and load the processed data.

    Returns:
        The `(train_dfs, test_dfs, word_dicts)` part of `pipeline.load_data()`.
    """
    config = dict(
        dataset_name="helpdesk",
        filepath="helpdesk.csv",
        columns=["Case ID", "Activity", "Complete Timestamp"],
        additional_columns=additional_columns,
        datetime_format="%Y-%m-%d %H:%M:%S.%f",
        task=constants.Task.NEXT_CATEGORICAL,
        model_learning_rate=0.001,
        model_epochs=10,
        model_num_layers=1,
        target_columns=target_columns,  # Examples: "concept:name", "Resource"
        train_columns=train_columns,
    )
    pipe = pipeline(**config)

    # Show the effective configuration
    print(pipe)

    # Only three of the nine loaded values are handed back; the rest are
    # unpacked (to keep the arity check) and discarded.
    (_loader, train_dfs, test_dfs, word_dicts, _max_case_length,
     _vocab_size_dict, _num_output, _categorical, _numerical) = pipe.load_data()
    return train_dfs, test_dfs, word_dicts


In [4]:
# Preprocess the helpdesk log: Resource is an additional categorical feature,
# the inputs are Resource + Activity and the target is Activity.
preprocess(
    additional_columns={Feature_Type.CATEGORICAL: ["Resource"]},
    train_columns=["Resource", "Activity"],
    target_columns=["Activity"],
)

# Earlier variants, kept for reference. The first two use a `target_column`
# keyword that preprocess() no longer accepts, so they will not run as written.
# preprocess(additional_columns=["Resource"], target_column="Activity")
# preprocess(additional_columns=["Resource"], target_column="Resource")

# train_dfs, test_dfs, word_dicts = test( additional_columns=["Resource"], train_columns=["Resource", "Activity"], target_columns=["Activity"] )

Parsing Event-Log...
No Processed features found
## COLUMNS ##
['concept_name', 'Resource']
Coding Log Meta-Data...


  0%|          | 0/2 [00:00<?, ?it/s]

Word dictionary for concept_name: {'x_word_dict': {'[PAD]': 0, '[UNK]': 1, 'assign-seriousness': 2, 'take-in-charge-ticket': 3, 'resolve-ticket': 4, 'closed': 5, 'insert-ticket': 6, 'wait': 7, 'create-sw-anomaly': 8, 'require-upgrade': 9, 'verified': 10, 'duplicate': 11, 'resolve-sw-anomaly': 12, 'schedule-intervention': 13, 'resolved': 14, 'invalid': 15}, 'y_word_dict': {'[PAD]': 0, '[UNK]': 1, 'assign-seriousness': 2, 'take-in-charge-ticket': 3, 'resolve-ticket': 4, 'closed': 5, 'insert-ticket': 6, 'wait': 7, 'create-sw-anomaly': 8, 'require-upgrade': 9, 'verified': 10, 'duplicate': 11, 'resolve-sw-anomaly': 12, 'schedule-intervention': 13, 'resolved': 14, 'invalid': 15}}


100%|██████████| 2/2 [00:00<00:00,  9.30it/s]

Word dictionary for Resource: {'x_word_dict': {'[PAD]': 0, '[UNK]': 1, 'Value_1': 2, 'Value_2': 3, 'Value_3': 4, 'Value_4': 5, 'Value_5': 6, 'Value_6': 7, 'Value_7': 8, 'Value_8': 9, 'Value_9': 10, 'Value_10': 11, 'Value_11': 12, 'Value_12': 13, 'Value_13': 14, 'Value_14': 15, 'Value_15': 16, 'Value_16': 17, 'Value_17': 18, 'Value_18': 19, 'Value_19': 20, 'Value_20': 21, 'Value_21': 22, 'Value_22': 23}, 'y_word_dict': {'[PAD]': 0, '[UNK]': 1, 'Value_1': 2, 'Value_2': 3, 'Value_3': 4, 'Value_4': 5, 'Value_5': 6, 'Value_6': 7, 'Value_7': 8, 'Value_8': 9, 'Value_9': 10, 'Value_10': 11, 'Value_11': 12, 'Value_12': 13, 'Value_13': 14, 'Value_14': 15, 'Value_15': 16, 'Value_16': 17, 'Value_17': 18, 'Value_18': 19, 'Value_19': 20, 'Value_20': 21, 'Value_21': 22, 'Value_22': 23}}



  return bound(*args, **kwds)


Preprocessing...


In [None]:
# `word_dicts` was only ever assigned by a commented-out call, so this cell
# failed on a fresh kernel. Load it explicitly from the preprocessed helpdesk
# data (the preprocessing cell must have been run first).
train_dfs, test_dfs, word_dicts = test(
    additional_columns={Feature_Type.CATEGORICAL: ["Resource"]},
    train_columns=["Resource", "Activity"],
    target_columns=["Activity"],
)
print(word_dicts)

In [None]:
# Baseline: activity only, no additional features.
# (Fixed: the comma between train_columns and target_columns was missing.)
# NOTE(review): `pipeline` annotates additional_columns as an optional dict keyed
# by Feature_Type -- confirm that an empty list is accepted here.
run(additional_columns=[], train_columns=["Activity"], target_columns=["Activity"])

In [None]:
# Activity + Resource as inputs, Activity as target.
# (Fixed: the comma between train_columns and target_columns was missing;
# additional_columns now uses the Feature_Type-keyed dict form that the
# preprocessing cell above uses.)
run(
    additional_columns={Feature_Type.CATEGORICAL: ["Resource"]},
    train_columns=["Resource", "Activity"],
    target_columns=["Activity"],
)

## Preprocessing and Loading

### Helpdesk

In [None]:
# Helpdesk log, next-activity task: preprocess and load in one call.
# NOTE(review): `process_and_load_data` is not defined in the visible part of
# this notebook -- confirm it exists before running this cell.
dataset_name = "helpdesk"
(
    data_loader,
    train_df,
    test_df,
    x_word_dict,
    y_word_dict,
    max_case_length,
    vocab_size,
    num_output,
    num_classes_list,
) = process_and_load_data(
    dataset_name=dataset_name,
    filepath="helpdesk.csv",
    columns=["Case ID", "Activity", "Complete Timestamp"],
    additional_columns=["Resource"],
    datetime_format="%Y-%m-%d %H:%M:%S.%f",
    task=constants.Task.NEXT_ACTIVITY,
)

### Sepsis

In [None]:
# Sepsis log: preprocess for the next-activity task.
dataset_name = "sepsis"
data_processor = LogsDataProcessor(
    name=dataset_name,
    filepath="sepsis.xes",
    # column names holding case id, activity name and timestamp, in that order
    columns=["case:concept:name", "concept:name", "time:timestamp"],
    additional_columns=["org:group"],
    datetime_format="%Y-%m-%d %H:%M:%S%z",
    pool=4,
)
data_processor.process_logs(
    task=constants.Task.NEXT_ACTIVITY,
    sort_temporally=False,
)

# The processor is not needed after this point; drop the reference.
del data_processor

## Data Preparation for Training

In [None]:
# Prepare training examples for next activity prediction task
train_token_x, train_token_y, train_additional_features, num_categorical_features, num_numerical_features = data_loader.prepare_data_next_activity(
    train_df, x_word_dict, y_word_dict, max_case_length, full_df=pd.concat([train_df, test_df])
)

# `data_loader` is intentionally kept alive: the evaluation cell further down
# calls data_loader.prepare_data_next_activity again for every prefix length,
# so deleting it here made that cell fail with a NameError.

## Model Training

In [None]:
# Hyper-parameters for this run
learning_rate = 0.001
batch_size = 12
epochs = 3

# Build the next-activity transformer
model = transformer.get_next_activity_model(
    max_case_length=max_case_length,
    vocab_size=vocab_size,
    output_dim=num_output,
    num_categorical_features=num_categorical_features,
    num_numerical_features=num_numerical_features,
    num_classes_list=num_classes_list,  # Pass the computed number of classes list
    num_layers=1,
)

# Adam + sparse categorical cross-entropy on logits, tracking accuracy
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# Fit; the additional-feature input is only passed when it has at least one column
model.fit(
    [train_token_x] if train_additional_features.shape[1] == 0 else [train_token_x, train_additional_features],
    train_token_y,
    epochs=epochs,
    batch_size=batch_size,
)

## Evaluation

In [None]:
# Prepare lists to store evaluation metrics
k, accuracies, fscores, precisions, recalls, weights = [], [], [], [], [], []

# Calculate total number of samples
total_samples = len(test_df)

# Iterate over all prefixes (k)
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    if len(test_data_subset) > 0:
        # Calculate weight for this prefix (its share of the test set)
        weight = len(test_data_subset) / total_samples

        # Prepare the test data
        test_token_x, test_token_y, test_additional_features, _, _ = data_loader.prepare_data_next_activity(
            test_data_subset, x_word_dict, y_word_dict, max_case_length, full_df=pd.concat([train_df, test_df])
        )

        # Make predictions; the additional-feature input is only passed when present
        if test_additional_features.shape[1] != 0:
            y_pred = np.argmax(model.predict([test_token_x, test_additional_features]), axis=1)
        else:
            y_pred = np.argmax(model.predict([test_token_x]), axis=1)

        # Compute metrics. zero_division=0 yields the same values as the default
        # but without an UndefinedMetricWarning for classes that are never
        # predicted (consistent with pipeline.evaluate).
        accuracy = metrics.accuracy_score(test_token_y, y_pred)
        precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
            test_token_y, y_pred, average="weighted", zero_division=0
        )

        # Store metrics and weight
        k.append(i)
        accuracies.append(accuracy)
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)
        weights.append(weight)

if weights:
    # Compute weighted mean metrics over all k
    weighted_accuracy = np.average(accuracies, weights=weights)
    weighted_fscore = np.average(fscores, weights=weights)
    weighted_precision = np.average(precisions, weights=weights)
    weighted_recall = np.average(recalls, weights=weights)

    # Append weighted mean metrics to the lists. The extra row is labelled with
    # k == max_case_length, a value the loop above never produces.
    k.append(max_case_length)
    accuracies.append(weighted_accuracy)
    fscores.append(weighted_fscore)
    precisions.append(weighted_precision)
    recalls.append(weighted_recall)
else:
    # np.average raises ZeroDivisionError on empty weights; report instead.
    print("No test samples matched any prefix length; skipping the weighted-mean row.")

# Create a DataFrame to display the results
results_df = pd.DataFrame({
    'k': k,
    'accuracy': accuracies,
    'fscore': fscores,
    'precision': precisions,
    'recall': recalls
})

# Display the results
print(results_df)

In [None]:
# The evaluation cell appends the weighted-mean row as the LAST element of each
# metric list, so it is sliced off here; otherwise the plain averages would
# count that summary value as if it were another prefix.
print('Average accuracy across all prefixes:', np.mean(accuracies[:-1]))
print('Average f-score across all prefixes:', np.mean(fscores[:-1]))
print('Average precision across all prefixes:', np.mean(precisions[:-1]))
print('Average recall across all prefixes:', np.mean(recalls[:-1]))

# Task: Next Time  -- Ignored for now

## Preprocessing and Loading

### Helpdesk

In [None]:
# Helpdesk log, next-time task: preprocess and load in one call.
# NOTE(review): `process_and_load_data` is not defined in the visible part of
# this notebook -- confirm it exists before running this cell.
dataset_name = "helpdesk"
(
    data_loader,
    train_df,
    test_df,
    x_word_dict,
    y_word_dict,
    max_case_length,
    vocab_size,
    num_output,
    num_classes_list,
) = process_and_load_data(
    dataset_name=dataset_name,
    filepath="helpdesk.csv",
    columns=["Case ID", "Activity", "Complete Timestamp"],
    additional_columns=["Resource", "product"],
    datetime_format="%Y-%m-%d %H:%M:%S.%f",
    task=constants.Task.NEXT_TIME,
)

### Sepsis

## Data Preparation for Training

In [None]:
# Build the training tensors and scalers for the next-time task from the
# shuffled training frame.
(
    train_token_x,
    train_time_x,
    train_token_y,
    train_additional_features,
    time_scaler,
    y_scaler,
    num_categorical_features,
    num_numerical_features,
) = data_loader.prepare_data_next_time(
    train_df,
    x_word_dict,
    max_case_length,
    shuffle=True,
)

# Drop the loader reference once the training data has been prepared.
del data_loader

# Task: Remaining Time -- Ignored for now

## Preprocessing and Loading

### Helpdesk

### Sepsis

In [None]:
# Sepsis log: preprocess and load with the NEXT_TIME task.
# NOTE(review): this cell sits under the "Remaining Time" heading but requests
# Task.NEXT_TIME, and `process_and_load_data` is not defined in the visible
# part of this notebook -- confirm both before running.
dataset_name = "sepsis"
(
    data_loader,
    train_df,
    test_df,
    x_word_dict,
    y_word_dict,
    max_case_length,
    vocab_size,
    num_output,
    num_classes_list,
) = process_and_load_data(
    dataset_name,
    "sepsis.xes",
    ["case:concept:name", "concept:name", "time:timestamp"],
    ["org:group"],
    "%Y-%m-%d %H:%M:%S%z",
    constants.Task.NEXT_TIME,
)

## Data Preparation for Training