In [1]:
# Imports
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics

from package.processtransformer import constants
from package.processtransformer.models import transformer
from package.processtransformer.data.loader import LogsDataLoader
from package.processtransformer.data.processor import LogsDataProcessor



# Initialize data dir, if not exists
if not os.path.exists("datasets"): 
    os.mkdir("datasets")
    
# Helper function to process and load data
def process_and_load_data(dataset_name, filepath, columns, additional_columns, datetime_format, task):
    data_processor = LogsDataProcessor(
        name=dataset_name,
        filepath=filepath,
        columns=columns,
        additional_columns=additional_columns,  # Add all additional columns here, first all categorical, then all numerical features
        datetime_format=datetime_format,
        pool=4
    )
    data_processor.process_logs(task=task, sort_temporally=False)
    train_df = pd.read_csv(f"datasets/{dataset_name}/processed/{task.value}_train.csv")
    num_classes_list = data_processor._compute_num_classes(train_df)
    del data_processor
    data_loader = LogsDataLoader(name=dataset_name)
    train_df, test_df, x_word_dict, y_word_dict, max_case_length, vocab_size, num_output = data_loader.load_data(task)
    return data_loader, train_df, test_df, x_word_dict, y_word_dict, max_case_length, vocab_size, num_output, num_classes_list

# Task: Next Activity

## Preprocessing and Loading

### Helpdesk

In [2]:
# Process and load data for the next activity task
dataset_name = "helpdesk"
data_loader, train_df, test_df, x_word_dict, y_word_dict, max_case_length, vocab_size, num_output, num_classes_list = process_and_load_data(
    dataset_name = dataset_name,
    filepath = "helpdesk.csv",
    columns = ["Case ID", "Activity", "Complete Timestamp"],
    additional_columns = ["Resource", "product"],
    datetime_format = "%Y-%m-%d %H:%M:%S.%f",
    task = constants.Task.NEXT_ACTIVITY)

No preprocessed train-test split for task next_activity found. Preprocessing...
Parsing Event-Log...
Parsing successful.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21348 entries, 0 to 21347
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   case:concept:name  21348 non-null  object        
 1   concept:name       21348 non-null  object        
 2   time:timestamp     21348 non-null  datetime64[ns]
 3   Resource           21348 non-null  object        
 4   product            21348 non-null  object        
dtypes: datetime64[ns](1), object(4)
memory usage: 834.0+ KB
None
  case:concept:name           concept:name      time:timestamp Resource  \
0            Case 1     assign-seriousness 2012-10-09 14:50:17  Value 1   
1            Case 1  take-in-charge-ticket 2012-10-09 14:51:01  Value 1   
2            Case 1  take-in-charge-ticket 2012-10-12 15:02:56  Value 2   
3            Case 1 

  return bound(*args, **kwds)


Loading data from preprocessed train-test split...


### Sepsis

In [None]:
# Dataset processing
dataset_name = "sepsis"
data_processor = LogsDataProcessor(
    name=dataset_name,
    filepath="sepsis.xes",
    columns=["case:concept:name", "concept:name", "time:timestamp"],  # specify the columns name containing case_id, activity name and timestamp
    additional_columns=["org:group"],
    datetime_format="%Y-%m-%d %H:%M:%S%z",
    pool=4
)
data_processor.process_logs(task=constants.Task.NEXT_ACTIVITY, sort_temporally=False)

# Garbage collection
del data_processor

## Data Preparation for Training

In [3]:
# Prepare training examples for next activity prediction task
train_token_x, train_token_y, train_additional_features, num_categorical_features, num_numerical_features = data_loader.prepare_data_next_activity(
    train_df, x_word_dict, y_word_dict, max_case_length, full_df=pd.concat([train_df, test_df])
)

# Garbage collection
del data_loader

Preparing data for task next_activity...


## Model Training

In [4]:
# Model parameters
learning_rate = 0.001
batch_size = 12
epochs = 10

# Define and compile the model
model = transformer.get_next_activity_model(
    max_case_length=max_case_length,
    vocab_size=vocab_size,
    output_dim=num_output,
    num_categorical_features=num_categorical_features,
    num_numerical_features=num_numerical_features,
    num_classes_list=num_classes_list,  # Pass the computed number of classes list
    num_layers=1
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)

# Train the model
if train_additional_features.shape[1] == 0:
    model.fit([train_token_x], train_token_y, epochs=epochs, batch_size=batch_size)
else:
    model.fit([train_token_x, train_additional_features], train_token_y, epochs=epochs, batch_size=batch_size)

Creating model for task next_activity...
Epoch 1/10


  output, from_logits = _get_logits(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Evaluation

In [5]:
# Instantiate the data loader for evaluation
data_loader = LogsDataLoader(name=dataset_name)

# Evaluate over all the prefixes (k) and save the results
k, accuracies, fscores, precisions, recalls = [], [], [], [], []
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"] == i]
    if len(test_data_subset) > 0:
        test_token_x, test_token_y, test_additional_features, _, _ = data_loader.prepare_data_next_activity(
            test_data_subset, x_word_dict, y_word_dict, max_case_length, full_df=pd.concat([train_df, test_df])
        )
        
        y_pred = np.argmax(model.predict([test_token_x, test_additional_features]), axis=1) if test_additional_features.shape[1] != 0 else np.argmax(model.predict([test_token_x]), axis=1)
        accuracy = metrics.accuracy_score(test_token_y, y_pred)
        precision, recall, fscore, _ = metrics.precision_recall_fscore_support(test_token_y, y_pred, average="weighted")
        
        k.append(i)
        accuracies.append(accuracy)
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)

# Compute mean metrics over all k
mean_accuracy = np.mean(accuracies)
mean_fscore = np.mean(fscores)
mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)

# Append mean metrics to the lists
k.append(max_case_length)
accuracies.append(mean_accuracy)
fscores.append(mean_fscore)
precisions.append(mean_precision)
recalls.append(mean_recall)

# Display the results
results_df = pd.DataFrame({
    'k': k,
    'accuracy': accuracies,
    'fscore': fscores,
    'precision': precisions,
    'recall': recalls
})

print(results_df)

Preparing data for task next_activity...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Preparing data for task next_activity...
Preparing data for task next_activity...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Preparing data for task next_activity...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Preparing data for task next_activity...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Preparing data for task next_activity...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Preparing data for task next_activity...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Preparing data for task next_activity...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Preparing data for task next_activity...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Preparing data for task next_activity...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Preparing data for task next_activity...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Preparing data for task next_activity...
     k  accuracy    fscore  precision    recall
0    0  0.836245  0.789133   0.829856  0.836245
1    1  0.664847  0.655426   0.647535  0.664847
2    2  0.813873  0.777961   0.770799  0.813873
3    3  0.826331  0.802044   0.801193  0.826331
4    4  0.801418  0.764404   0.773232  0.801418
5    5  0.855072  0.817633   0.801242  0.855072
6    6  0.727273  0.656412   0.617211  0.727273
7    7  0.615385  0.512821   0.461538  0.615385
8    8  0.571429  0.485714   0.464286  0.571429
9    9  0.750000  0.650000   0.583333  0.750000
10  10  1.000000  1.000000   1.000000  1.000000
11  11  1.000000  1.000000   1.000000  1.000000
12  14  0.788489  0.742629   0.729185  0.788489


In [6]:
print('Average accuracy across all prefixes:', np.mean(accuracies))
print('Average f-score across all prefixes:', np.mean(fscores))
print('Average precision across all prefixes:', np.mean(precisions))
print('Average recall across all prefixes:', np.mean(recalls))

Average accuracy across all prefixes: 0.7884893237674948
Average f-score across all prefixes: 0.7426289811324243
Average precision across all prefixes: 0.7291853815016823
Average recall across all prefixes: 0.7884893237674948


# Task: Next Time  -- Ignored for now

## Preprocessing and Loading

### Helpdesk

In [None]:
# Process and load data for the next activity task
dataset_name = "helpdesk"
data_loader, train_df, test_df, x_word_dict, y_word_dict, max_case_length, vocab_size, num_output, num_classes_list = process_and_load_data(
    dataset_name = dataset_name,
    filepath = "helpdesk.csv",
    columns = ["Case ID", "Activity", "Complete Timestamp"],
    additional_columns = ["Resource", "product"],
    datetime_format = "%Y-%m-%d %H:%M:%S.%f",
    task = constants.Task.NEXT_TIME)

### Sepsis

## Data Preparation for Training

In [None]:
# Prepare training examples for next time prediction task
train_token_x, train_time_x, train_token_y, train_additional_features, time_scaler, y_scaler, num_categorical_features, num_numerical_features = data_loader.prepare_data_next_time(
    train_df, x_word_dict, max_case_length, shuffle=True)

# Garbage collection
del data_loader

# Task: Remaining Time -- Ignored for now

## Preprocessing and Loading

### Helpdesk

### Sepsis

In [None]:
# Process and load data for the next time task
dataset_name = "sepsis"
data_loader, train_df, test_df, x_word_dict, y_word_dict, max_case_length, vocab_size, num_output, num_classes_list = process_and_load_data(
    dataset_name, "sepsis.xes", ["case:concept:name", "concept:name", "time:timestamp"], ["org:group"], "%Y-%m-%d %H:%M:%S%z", constants.Task.NEXT_TIME)

## Data Preparation for Training