# Training Transformers for Next-Activity Prediction

This repository contains three scripts. Fill out this notebook to explain the code in each of them.

(concept taken from https://arxiv.org/abs/2104.00721)

## Transformer Architecture

(Explain the layers and the type of positional encoding that is used; optionally suggest improvements.)

- Optional task: incorporate the findings of this paper on positional-encoding layers for time-series data: https://link.springer.com/article/10.1007/s10618-023-00948-2

In [2]:
# the code that is talked about
import tensorflow as tf
from tensorflow.keras import layers

class TransformerBlock(layers.Layer):
    """Single Transformer encoder block (attention + feed-forward, post-norm).

    The block applies multi-head self-attention and then a two-layer
    feed-forward network. Each of the two sub-layers is followed by dropout,
    a residual connection and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input. The residual additions
            require the input to already have this width.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate used after the attention and after the
            feed-forward network.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        # NOTE(review): key_dim is the size of EACH head, so every head gets a
        # full embed_dim-wide projection (not embed_dim // num_heads). Left
        # unchanged because altering it changes the weight shapes.
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_a = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm_b = layers.LayerNormalization(epsilon=1e-6)
        self.dropout_a = layers.Dropout(rate)
        self.dropout_b = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to both dropout layers. Defaults to ``None``
                so the layer can be called without an explicit mode and Keras
                can supply the mode of the surrounding fit/predict call.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence is both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout_a(attn_output, training=training)
        out_a = self.layernorm_a(inputs + attn_output)
        ffn_output = self.ffn(out_a)
        ffn_output = self.dropout_b(ffn_output, training=training)
        return self.layernorm_b(out_a + ffn_output)

class TokenAndPositionEmbedding(layers.Layer):
    """Adds a learned position embedding to a learned token embedding.

    Args:
        maxlen: Number of distinct positions the position table can hold.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return ``token_emb(x) + pos_emb(0 .. len-1)``.

        The position indices are derived from the size of the last axis of
        ``x``; the position vectors are broadcast over the leading axes.
        """
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(seq_len))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

def get_model(max_case_length, vocab_size, output_dim,
    embed_dim = 36, num_heads = 4, ff_dim = 64):
    """Build the next-activity prediction Transformer.

    Pipeline: token + position embedding -> one TransformerBlock ->
    global average pooling over the sequence axis -> dropout -> Dense(64,
    relu) -> dropout -> Dense(output_dim) with linear activation.

    Args:
        max_case_length: Length of every (padded) input sequence.
        vocab_size: Size of the input token vocabulary.
        output_dim: Number of output units (one per target class).
        embed_dim: Embedding width used throughout the block.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the block's feed-forward network.

    Returns:
        A ``tf.keras.Model`` named ``"my_transformer"``. The last layer is
        linear, so the outputs are unnormalized scores (logits).
    """
    inputs = layers.Input(shape=(max_case_length,))
    x = TokenAndPositionEmbedding(max_case_length, vocab_size, embed_dim)(inputs)
    # Do not pin the mode here. A literal training=True is recorded as a fixed
    # argument of this node in the functional graph, so the block's dropout
    # can remain active when the model is used for inference. Passing None
    # lets Keras supply the mode of the surrounding call instead.
    x = TransformerBlock(embed_dim, num_heads, ff_dim)(x, training=None)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    outputs = layers.Dense(output_dim, activation="linear")(x)
    transformer = tf.keras.Model(inputs=inputs, outputs=outputs,
        name = "my_transformer")
    return transformer

2024-07-01 15:45:31.547283: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-01 15:45:31.575858: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-01 15:45:31.575893: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-01 15:45:31.595167: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Dataset

(What is the dataset, what is the vocabulary size, why is it processed the way it is)

In [3]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import utils

class LogsDataLoader:
    """Loads processed event-log files and encodes them as model inputs."""

    def __init__(self, name, dir_path = "./datasets"):
        """Remember where the processed files of one dataset live.

        Args:
            name: str: name of the dataset as used during processing raw logs
            dir_path: str: Path to dataset directory
        """
        self._dir_path = f"{dir_path}/{name}/processed"

    def prepare_data_next_activity(self, df,
        x_word_dict, y_word_dict,
        max_case_length, shuffle=True):
        """Encode prefixes and next-activity labels as float32 arrays.

        Args:
            df: Frame with a whitespace-separated ``"prefix"`` column and a
                ``"next_act"`` column.
            x_word_dict: Maps each prefix token to its integer index.
            y_word_dict: Maps each next-activity label to its class index.
            max_case_length: Width the encoded prefixes are padded to.
            shuffle: If true, prefixes and labels are shuffled together
                before encoding.

        Returns:
            ``(token_x, token_y)``: the padded prefix matrix and the label
            vector, both ``np.float32``.

        Raises:
            KeyError: If a token or label is missing from its dictionary.
        """
        prefixes = df["prefix"].values
        labels = df["next_act"].values
        if shuffle:
            prefixes, labels = utils.shuffle(prefixes, labels)

        encoded_prefixes = [
            [x_word_dict[token] for token in prefix.split()]
            for prefix in prefixes
        ]
        encoded_labels = [y_word_dict[label] for label in labels]

        # pad_sequences is used with its defaults; only maxlen is set.
        padded_prefixes = tf.keras.preprocessing.sequence.pad_sequences(
            encoded_prefixes, maxlen=max_case_length)

        return (np.array(padded_prefixes, dtype=np.float32),
                np.array(encoded_labels, dtype=np.float32))

    def get_max_case_length(self, train_x):
        """Return the largest number of whitespace-separated tokens in ``train_x``."""
        return max(len(prefix.split()) for prefix in train_x)

    def load_data(self, task):
        """Read the train/test CSVs of ``task`` plus the dataset metadata.

        Returns:
            Tuple ``(train_df, test_df, x_word_dict, y_word_dict,
            max_case_length, vocab_size, total_classes)`` where the maximum
            case length is measured on the training prefixes and the two
            sizes are the lengths of the respective dictionaries.
        """
        base = self._dir_path
        train_df = pd.read_csv(f"{base}/{task}_train.csv")
        test_df = pd.read_csv(f"{base}/{task}_test.csv")

        with open(f"{base}/metadata.json", "r") as json_file:
            metadata = json.load(json_file)

        x_word_dict = metadata["x_word_dict"]
        y_word_dict = metadata["y_word_dict"]

        return (
            train_df,
            test_df,
            x_word_dict,
            y_word_dict,
            self.get_max_case_length(train_df["prefix"].values),
            len(x_word_dict),
            len(y_word_dict),
        )


## Training and Using The Model

(model compilation, loss function, accuracy metrics, data loading...)

In [39]:
# the code that is talked about
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics 

import loader
import architecture


# Dataset and output locations
dataset = "helpdesk"
model_dir = "./models"
result_dir = "./results"
task = "next_activity"

# Training hyper-parameters
epochs = 10
batch_size = 12
learning_rate = 0.001

# Index of the GPU exposed to this process
gpu = 0
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu}"

if __name__ == "__main__":
    # Create the output directories, then point the paths at the actual files.
    model_path = f"{model_dir}/{dataset}"
    os.makedirs(model_path, exist_ok=True)
    model_path = f"{model_path}/next_activity_ckpt.weights.h5"

    result_path = f"{result_dir}/{dataset}"
    os.makedirs(result_path, exist_ok=True)
    result_path = f"{result_path}/results"

    # Load data
    data_loader = loader.LogsDataLoader(name=dataset)

    (train_df, test_df, x_word_dict, y_word_dict, max_case_length,
        vocab_size, num_output) = data_loader.load_data(task)

    # Prepare training examples for next activity prediction task
    train_token_x, train_token_y = data_loader.prepare_data_next_activity(train_df,
        x_word_dict, y_word_dict, max_case_length)

    # Create and train a transformer model
    transformer_model = architecture.get_model(
        max_case_length=max_case_length,
        vocab_size=vocab_size,
        output_dim=num_output)

    transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

    # fit() receives no validation data, so the checkpoint keeps the weights
    # with the best TRAINING accuracy.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=model_path,
        save_weights_only=True,
        monitor="sparse_categorical_accuracy",
        mode="max", save_best_only=True)

    transformer_model.fit(train_token_x, train_token_y,
        epochs=epochs, batch_size=batch_size,
        shuffle=True, verbose=2, callbacks=[model_checkpoint_callback])

    # Evaluate every prefix length k separately and save the results
    k, accuracies, fscores, precisions, recalls = [], [], [], [], []
    for i in range(max_case_length):
        test_data_subset = test_df[test_df["k"] == i]
        if len(test_data_subset) == 0:
            continue
        test_token_x, test_token_y = data_loader.prepare_data_next_activity(test_data_subset,
            x_word_dict, y_word_dict, max_case_length)
        y_pred = np.argmax(transformer_model.predict(test_token_x), axis=1)
        accuracy = metrics.accuracy_score(test_token_y, y_pred)
        precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
            test_token_y, y_pred, average="weighted")
        k.append(i)
        accuracies.append(accuracy)
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)

    # Unweighted mean of each metric over the evaluated prefix lengths.
    # Computed from the per-prefix LISTS before the summary row is appended;
    # the previous code averaged only the last prefix's accuracy scalar.
    mean_accuracy = np.mean(accuracies)
    mean_fscore = np.mean(fscores)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)

    # Summary row: stored under k == max_case_length, one past the last
    # prefix index the loop can produce.
    k.append(max_case_length)
    accuracies.append(mean_accuracy)
    fscores.append(mean_fscore)
    precisions.append(mean_precision)
    recalls.append(mean_recall)

    print('Average accuracy across all prefixes:', mean_accuracy)
    print('Average f-score across all prefixes:', mean_fscore)
    print('Average precision across all prefixes:', mean_precision)
    print('Average recall across all prefixes:', mean_recall)
    results_df = pd.DataFrame({"k":k, "accuracy":accuracies, "fscore": fscores,
        "precision":precisions, "recall":recalls})
    results_df.to_csv(result_path+"_next_activity.csv", index=False)





Epoch 1/10
1107/1107 - 10s - 9ms/step - loss: 0.7255 - sparse_categorical_accuracy: 0.7679
Epoch 2/10
1107/1107 - 5s - 5ms/step - loss: 0.5834 - sparse_categorical_accuracy: 0.8183
Epoch 3/10
1107/1107 - 6s - 6ms/step - loss: 0.5747 - sparse_categorical_accuracy: 0.8192
Epoch 4/10
1107/1107 - 5s - 4ms/step - loss: 0.5728 - sparse_categorical_accuracy: 0.8191
Epoch 5/10
1107/1107 - 5s - 4ms/step - loss: 0.5716 - sparse_categorical_accuracy: 0.8193
Epoch 6/10
1107/1107 - 5s - 5ms/step - loss: 0.5672 - sparse_categorical_accuracy: 0.8194
Epoch 7/10
1107/1107 - 5s - 5ms/step - loss: 0.5654 - sparse_categorical_accuracy: 0.8178
Epoch 8/10
1107/1107 - 6s - 5ms/step - loss: 0.5627 - sparse_categorical_accuracy: 0.8195
Epoch 9/10
1107/1107 - 5s - 5ms/step - loss: 0.5657 - sparse_categorical_accuracy: 0.8201
Epoch 10/10
1107/1107 - 5s - 4ms/step - loss: 0.5631 - sparse_categorical_accuracy: 0.8193
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m29/29[0m [32m━━━━━━

In [40]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics

import loader
import architecture

# Dataset and output locations
dataset, task = "helpdesk", "next_activity"
model_dir = "./models"
result_dir = "./results"

# Training hyper-parameters
epochs = 20
batch_size = 16
learning_rate = 0.001

# Index of the GPU exposed to this process
gpu = 0
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu}"

def generate_text(model, seed_text, x_word_dict, max_length,
                  max_case_length=None, y_word_dict=None):
    """
    Extend a seed sequence by repeatedly predicting the next activity.

    Args:
    - model: Trained model; ``model.predict`` must return one row of class
      scores for a ``(1, max_case_length)`` batch.
    - seed_text: Whitespace-separated activities to start from. Words that
      are not keys of ``x_word_dict`` are kept in the output but are not fed
      to the model.
    - x_word_dict: Dictionary mapping input words to indices.
    - max_length: Maximum number of activities to generate.
    - max_case_length: Width of the model input. Defaults to the last entry
      of ``model.input_shape`` instead of relying on a global variable.
    - y_word_dict: Optional dictionary mapping output labels to class
      indices. The model is trained on ``y_word_dict`` indices, so pass it
      to decode predictions correctly. When omitted, predictions are looked
      up in ``x_word_dict`` as before.

    Returns:
    - generated_text: Seed words plus generated words, joined by spaces.

    Raises:
    - ValueError: If no seed word is in ``x_word_dict``.
    """
    if max_case_length is None:
        max_case_length = int(model.input_shape[-1])

    index_to_input_word = {v: k for k, v in x_word_dict.items()}
    index_to_label = None
    if y_word_dict is not None:
        index_to_label = {v: k for k, v in y_word_dict.items()}

    seed_words = seed_text.split()

    # Ensure seed_text has valid words
    token_list = [x_word_dict[word] for word in seed_words if word in x_word_dict]
    if not token_list:
        raise ValueError("Seed text contains no valid words from the vocabulary.")

    for _ in range(max_length):
        # Keep only the most recent tokens once the sequence outgrows the
        # model input; a negative pad width would otherwise raise.
        window = token_list[-max_case_length:]

        # Zero-pad on the LEFT: LogsDataLoader builds the training inputs with
        # pad_sequences defaults, which pad at the front of the sequence.
        model_input = np.zeros((1, max_case_length), dtype=np.int64)
        model_input[0, max_case_length - len(window):] = window

        # Predict the next word
        predicted = int(np.argmax(model.predict(model_input), axis=-1)[0])

        if index_to_label is None:
            # Legacy decoding: treat the class index as an input-token index.
            output_word = index_to_input_word.get(predicted, '')
            next_token = predicted
        else:
            # Decode with the label vocabulary, then re-encode the word with
            # the input vocabulary so it can be fed back into the model.
            output_word = index_to_label.get(predicted, '')
            next_token = x_word_dict.get(output_word)

        seed_words.append(output_word)

        # Stop at the end token, or when the predicted label has no input
        # token and therefore cannot extend the prefix.
        if output_word == 'endtoken' or next_token is None:
            break
        token_list.append(next_token)

    return ' '.join(seed_words)

if __name__ == "__main__":
    # Create the output directories, then point the paths at the actual files.
    model_path = f"{model_dir}/{dataset}"
    os.makedirs(model_path, exist_ok=True)
    model_path = f"{model_path}/next_activity_ckpt.weights.h5"

    result_path = f"{result_dir}/{dataset}"
    os.makedirs(result_path, exist_ok=True)
    result_path = f"{result_path}/results"

    # Load data
    data_loader = loader.LogsDataLoader(name=dataset)

    (train_df, test_df, x_word_dict, y_word_dict, max_case_length,
        vocab_size, num_output) = data_loader.load_data(task)

    # Prepare training examples for next activity prediction task
    train_token_x, train_token_y = data_loader.prepare_data_next_activity(train_df,
        x_word_dict, y_word_dict, max_case_length)

    # Create and train a transformer model
    transformer_model = architecture.get_model(
        max_case_length=max_case_length,
        vocab_size=vocab_size,
        output_dim=num_output)

    transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

    # fit() receives no validation data, so the checkpoint keeps the weights
    # with the best TRAINING accuracy.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=model_path,
        save_weights_only=True,
        monitor="sparse_categorical_accuracy",
        mode="max", save_best_only=True)

    transformer_model.fit(train_token_x, train_token_y,
        epochs=epochs, batch_size=batch_size,
        shuffle=True, verbose=2, callbacks=[model_checkpoint_callback])

    # Load the best model weights
    transformer_model.load_weights(model_path)

    # Generate text using the trained model.
    # NOTE(review): generate_text splits the seed on whitespace and keeps only
    # words that are keys of x_word_dict; a seed without any such word is
    # reported through the ValueError branch below.
    seed_text = "Seriousness assigned"
    max_length = 100
    try:
        generated_text = generate_text(transformer_model, seed_text, x_word_dict, max_length)
        print("Generated Text:\n", generated_text)
    except ValueError as e:
        print(e)

    # Evaluate every prefix length k separately and save the results
    k, accuracies, fscores, precisions, recalls = [], [], [], [], []
    for i in range(max_case_length):
        test_data_subset = test_df[test_df["k"] == i]
        if len(test_data_subset) == 0:
            continue
        test_token_x, test_token_y = data_loader.prepare_data_next_activity(test_data_subset,
            x_word_dict, y_word_dict, max_case_length)
        y_pred = np.argmax(transformer_model.predict(test_token_x), axis=1)
        accuracy = metrics.accuracy_score(test_token_y, y_pred)
        precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
            test_token_y, y_pred, average="weighted")
        k.append(i)
        accuracies.append(accuracy)
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)

    # Unweighted mean of each metric over the evaluated prefix lengths.
    # Computed from the per-prefix LISTS before the summary row is appended;
    # the previous code averaged only the last prefix's accuracy scalar.
    mean_accuracy = np.mean(accuracies)
    mean_fscore = np.mean(fscores)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)

    # Summary row: stored under k == max_case_length, one past the last
    # prefix index the loop can produce.
    k.append(max_case_length)
    accuracies.append(mean_accuracy)
    fscores.append(mean_fscore)
    precisions.append(mean_precision)
    recalls.append(mean_recall)

    print('Average accuracy across all prefixes:', mean_accuracy)
    print('Average f-score across all prefixes:', mean_fscore)
    print('Average precision across all prefixes:', mean_precision)
    print('Average recall across all prefixes:', mean_recall)
    results_df = pd.DataFrame({"k": k, "accuracy": accuracies, "fscore": fscores,
        "precision": precisions, "recall": recalls})
    results_df.to_csv(result_path + "_next_activity.csv", index=False)


Epoch 1/20
830/830 - 8s - 9ms/step - loss: 0.7700 - sparse_categorical_accuracy: 0.7469
Epoch 2/20
830/830 - 4s - 5ms/step - loss: 0.5901 - sparse_categorical_accuracy: 0.8179
Epoch 3/20
830/830 - 4s - 5ms/step - loss: 0.5785 - sparse_categorical_accuracy: 0.8167
Epoch 4/20
830/830 - 4s - 5ms/step - loss: 0.5750 - sparse_categorical_accuracy: 0.8184
Epoch 5/20
830/830 - 5s - 6ms/step - loss: 0.5710 - sparse_categorical_accuracy: 0.8181
Epoch 6/20
830/830 - 6s - 7ms/step - loss: 0.5706 - sparse_categorical_accuracy: 0.8187
Epoch 7/20
830/830 - 10s - 12ms/step - loss: 0.5690 - sparse_categorical_accuracy: 0.8191
Epoch 8/20
830/830 - 4s - 5ms/step - loss: 0.5676 - sparse_categorical_accuracy: 0.8207
Epoch 9/20
830/830 - 4s - 5ms/step - loss: 0.5627 - sparse_categorical_accuracy: 0.8180
Epoch 10/20
830/830 - 8s - 10ms/step - loss: 0.5657 - sparse_categorical_accuracy: 0.8193
Epoch 11/20
830/830 - 4s - 5ms/step - loss: 0.5614 - sparse_categorical_accuracy: 0.8200
Epoch 12/20
830/830 - 4s - 

In [38]:
# Retry generation with another seed, reusing the model and vocabulary that
# the previous cell left in the notebook namespace.
seed_text, max_length = "resolve ticket", 100
try:
    generated_text = generate_text(
        model=transformer_model,
        seed_text=seed_text,
        x_word_dict=x_word_dict,
        max_length=max_length,
    )
    print("Generated Text:\n", generated_text)
except ValueError as err:
    # Raised by generate_text when no seed word is a key of x_word_dict.
    print(err)

Seed text contains no valid words from the vocabulary.


In [33]:
import os
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn import metrics
import loader
import architecture

# Settings: which dataset/task to load and where its checkpoint is stored
dataset, task = "helpdesk", "next_activity"
model_dir = "./models"

# Index of the GPU exposed to this process
gpu = 0
os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu}"

def generate_next_activity(model, input_sequence, x_word_dict, max_case_length,
                           y_word_dict=None):
    """
    Predict the next activity given an input sequence using the trained model.

    Args:
    - model: Trained transformer model; ``model.predict`` must return one row
      of class scores for a ``(1, max_case_length)`` batch.
    - input_sequence: List of activities (strings) representing the input
      sequence. Entries that are not keys of ``x_word_dict`` are skipped.
    - x_word_dict: Dictionary mapping words to indices.
    - max_case_length: Width of the model input. Longer sequences are cut to
      their most recent ``max_case_length`` activities.
    - y_word_dict: Optional dictionary mapping output labels to class
      indices. The model is trained on ``y_word_dict`` indices, so pass it
      to decode the prediction correctly. When omitted, the prediction is
      looked up in ``x_word_dict`` as before.

    Returns:
    - next_activity: The predicted next activity as a string ('' when the
      predicted index has no entry in the decoding dictionary).

    Raises:
    - ValueError: If no entry of ``input_sequence`` is in ``x_word_dict``.
    """
    decode_dict = x_word_dict if y_word_dict is None else y_word_dict
    index_to_word = {v: k for k, v in decode_dict.items()}

    token_list = [x_word_dict[word] for word in input_sequence if word in x_word_dict]
    if not token_list:
        raise ValueError("Input sequence contains no valid words from the vocabulary.")

    # Keep the most recent tokens; a negative pad width would otherwise raise
    # for inputs longer than the model window.
    window = token_list[-max_case_length:]

    # Zero-pad on the LEFT: LogsDataLoader builds the training inputs with
    # pad_sequences defaults, which pad at the front of the sequence.
    model_input = np.zeros((1, max_case_length), dtype=np.int64)
    model_input[0, max_case_length - len(window):] = window

    # Predict the next word
    predicted = int(np.argmax(model.predict(model_input), axis=-1)[0])

    return index_to_word.get(predicted, '')

if __name__ == "__main__":
    # Read the processed logs together with the vocabularies and sizes
    data_loader = loader.LogsDataLoader(name=dataset)
    (
        train_df,
        test_df,
        x_word_dict,
        y_word_dict,
        max_case_length,
        vocab_size,
        num_output,
    ) = data_loader.load_data(task)

    # Rebuild the architecture, then restore the trained weights into it
    transformer_model = architecture.get_model(
        max_case_length=max_case_length,
        vocab_size=vocab_size,
        output_dim=num_output,
    )
    model_path = f"{model_dir}/{dataset}/next_activity_ckpt.weights.h5"
    transformer_model.load_weights(model_path)

    # Example input sequence for inference.
    # NOTE(review): the loader splits prefixes on whitespace before looking
    # tokens up in x_word_dict, so the vocabulary the model was trained on has
    # no multi-word entries. Entries with spaces like the ones below are likely
    # all filtered out, giving the ValueError message — use keys of x_word_dict.
    input_sequence = ["Assign seriousness", "Value 1", "Take in charge ticket"]

    # Predict the activity that follows the example sequence
    try:
        next_activity = generate_next_activity(
            transformer_model, input_sequence, x_word_dict, max_case_length
        )
        print("Next Activity:", next_activity)
    except ValueError as err:
        print(err)


Input sequence contains no valid words from the vocabulary.
