### Here we assume that the set of possible categories is fixed, so no LM heads are needed

- `Extract` the brand name from the context
- `Predict` the categories from the fixed set of categories

In [1]:
import tensorflow as tf
import re

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# GPU setup for the two-GPU "wickerman" machine:
# 1. Nvidia A40
# 2. Nvidia RTX 3060
# Only the first enumerated GPU is made visible below, with a capped logical device.

gpus = tf.config.list_physical_devices('GPU')

tf.config.set_visible_devices(gpus[0], 'GPU') # use the Nvidia A40 only (assumes it is enumerated as gpus[0] -- confirm)

log_dev_conf_a40 = tf.config.LogicalDeviceConfiguration(
    memory_limit=45*1024 # value is in MB, i.e. 45 GB. NOTE(review): earlier comments said 25 GB and the run log reports a failed 45.00GiB allocation -- confirm the intended cap
)

tf.config.set_logical_device_configuration(
    gpus[0], # select GPU_0, i.e., Nvidia A40
    [log_dev_conf_a40] # apply the memory cap defined above
)

# tf.config.set_visible_devices(gpus[1], 'GPU') # use RTX3060 only

# log_dev_conf_rtx3060 = tf.config.LogicalDeviceConfiguration(
#     memory_limit=12*1024 # 11 GB allocation for rtx3060 GPU
# )

# tf.config.set_logical_device_configuration(
#     gpus[1], # select GPU_1, i.e., Nvidia rtx3060
#     [log_dev_conf_rtx3060] # apply 11GB config
# )

# MultiGPU setup
# Create a MirroredStrategy.
# strategy = None
# strategy = tf.distribute.MirroredStrategy()
# print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

# Report how many physical GPUs exist versus how many logical devices were configured.
logical_gpus = tf.config.list_logical_devices('GPU')
print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")

2024-09-02 23:31:04.347504: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-02 23:31:04.347530: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-02 23:31:04.348290: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-02 23:31:04.352337: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2 Physical GPUs, 1 Logical GPU


2024-09-02 23:31:05.394560: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-02 23:31:05.395843: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-02 23:31:05.418076: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [2]:
import json
import os
import tensorflow as tf
import numpy as np

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import Progbar
from tensorflow.keras.metrics import Mean
from pqdm.threads import pqdm
from tqdm import tqdm

In [3]:
from transformers import TFAutoModelForMaskedLM, AutoTokenizer, TFRobertaModel

# Hugging Face checkpoint used for both the encoder and the tokenizer.
model_id = 'roberta-base'

# Bare encoder (the load log reports the checkpoint's lm_head weights as unused).
# NOTE(review): the truncated load warning suggests some TF weights were freshly initialised --
# confirm whether that includes the pooler that later produces `pooler_output`.
roberta = TFRobertaModel.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

2024-09-02 23:31:06.598519: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1101] failed to allocate 45.00GiB (48318382080 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['roberta.embeddings.position_ids', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch mo

In [4]:
# Define new special tokens
# NOTE(review): these markers do not occur in `prompt_template`, and no embedding resize of
# `roberta` is visible in this notebook -- confirm a resize happens before they are ever fed to the model.
new_special_tokens = {
    'additional_special_tokens': ['<category>', '</category_end>', '<brand>', '</brand_end>']
}

# Add the new special tokens to the tokenizer (the cell output is the call's return value)
tokenizer.add_special_tokens(new_special_tokens)

4

In [5]:
class CustomRobertaClassifier(tf.keras.Model):
    """Single-task classifier: RoBERTa encoder -> mean pooling -> dropout -> softmax layer.

    Args:
        roberta_model: wrapper object exposing the encoder as its `.roberta` attribute.
        num_classes: number of output classes of the final dense layer.
    """

    def __init__(self, roberta_model, num_classes):
        super().__init__()
        # Only the inner encoder of the wrapper is kept.
        self.roberta = roberta_model.roberta
        self.global_pool = tf.keras.layers.GlobalAveragePooling1D()
        self.dropout = tf.keras.layers.Dropout(0.3)
        self.classifier = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        """Return the classifier output for `inputs` (softmax-activated, so probabilities)."""
        # First element of the encoder output is the per-token hidden state.
        hidden_states = self.roberta(inputs, training=training)[0]
        # Average over the sequence axis, then regularise before classifying.
        averaged = self.global_pool(hidden_states)
        averaged = self.dropout(averaged, training=training)
        return self.classifier(averaged)

In [6]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: path of the file; every non-blank line must hold one JSON document.

    Returns:
        list of decoded JSON values, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding='utf-8') as input_file:
        # Blank / whitespace-only lines (e.g. a trailing empty line) carry no record.
        return [json.loads(line) for line in input_file if line.strip()]

def load_data(root_path):
    """Load every regular file directly under `root_path` as JSON Lines.

    Args:
        root_path: directory containing the JSONL files.

    Returns:
        dict mapping each file name (not the full path) to the list returned
        by `load_jsonl` for that file.
    """
    dataset = {}

    # Sorted so the resulting dict order does not depend on the file system.
    for filename in sorted(os.listdir(root_path)):
        filepath = os.path.join(root_path, filename)

        # Skip sub-directories (e.g. `.ipynb_checkpoints`), which cannot be opened as files.
        if not os.path.isfile(filepath):
            continue

        dataset[filename] = load_jsonl(filepath)

    return dataset

In [7]:
# Load every file found under data/L1 into a dict keyed by file name.
dataset = load_data('data/L1')

In [8]:
# Model identifiers (not referenced again in this cell).
models = ['llama3.1', 'gemma2:2b', 'gemma2']

# File-name stems of the three splits stored in `dataset`.
test_from = 'attrebute_val'
train_from = 'attrebute_train'
target_from = 'attrebute_test'

# Each split is stored under "<stem>.data" (inputs) and "<stem>.solution" (labels);
# look the records up directly instead of first binding the key to the same name.
test_inputs = dataset[test_from + '.data']
test_labels = dataset[test_from + '.solution']
train_inputs = dataset[train_from + '.data']
train_labels = dataset[train_from + '.solution']
target_inputs = dataset[target_from + '.data']

In [9]:
import pandas as pd

In [10]:
# Tabular view of the label records of both labelled splits.
train_labels_df = pd.DataFrame.from_records(train_labels)
test_labels_df = pd.DataFrame.from_records(test_labels)

# One label vocabulary per hierarchy level: the distinct values seen in either split.
(
    L0_unique_labels,
    L1_unique_labels,
    L2_unique_labels,
    L3_unique_labels,
    L4_unique_labels,
) = (
    pd.concat([train_labels_df, test_labels_df])[column].unique().tolist()
    for column in (
        'L0_category',
        'L1_category',
        'L2_category',
        'L3_category',
        'L4_category',
    )
)

In [11]:
def one_hot_encode(index, k):
    """Return a length-`k` one-hot vector with a 1 at position `index`.

    Args:
        index: zero-based position of the hot element, 0 <= index < k.
        k: length of the vector (number of classes).

    Returns:
        1-D numpy array of zeros with a single 1 at `index`.

    Raises:
        IndexError: if `index` is outside [0, k).
    """
    if not 0 <= index < k:
        raise IndexError(f'index {index} is out of range for {k} classes')

    one_hot = np.zeros(k)  # start from all zeros
    # Callers pass zero-based positions (from `list.index`), so use the index as-is;
    # `index - 1` shifted every label by one slot and wrapped label 0 to the last slot.
    one_hot[index] = 1
    return one_hot

In [12]:
# Label columns of the five hierarchy levels, coarsest (L0) first.
columns = [
    'L0_category',
    'L1_category',
    'L2_category',
    'L3_category',
    'L4_category',
]

# Keep just the category columns of each labelled split.
train_category_labels, test_category_labels = train_labels_df[columns], test_labels_df[columns]

In [13]:
def _encode_category_column(values, unique_labels):
    """One-hot encode a column of labels against its label vocabulary.

    Args:
        values: iterable of labels (one per example).
        unique_labels: list of all labels of this level; a label's position in
            this list is the class index passed to `one_hot_encode`.

    Returns:
        float32 numpy array with one row per element of `values` and
        `len(unique_labels)` columns.

    Raises:
        KeyError: if a value is not present in `unique_labels`.
    """
    # Dict lookup replaces a linear `list.index` scan per example.
    label_to_index = {label: position for position, label in enumerate(unique_labels)}
    num_classes = len(unique_labels)
    return np.array(
        [one_hot_encode(label_to_index[item], num_classes) for item in values],
        dtype=np.float32,
    )


train_category_L0 = _encode_category_column(train_category_labels['L0_category'], L0_unique_labels)
train_category_L1 = _encode_category_column(train_category_labels['L1_category'], L1_unique_labels)
train_category_L2 = _encode_category_column(train_category_labels['L2_category'], L2_unique_labels)
train_category_L3 = _encode_category_column(train_category_labels['L3_category'], L3_unique_labels)
train_category_L4 = _encode_category_column(train_category_labels['L4_category'], L4_unique_labels)

test_category_L0 = _encode_category_column(test_category_labels['L0_category'], L0_unique_labels)
test_category_L1 = _encode_category_column(test_category_labels['L1_category'], L1_unique_labels)
test_category_L2 = _encode_category_column(test_category_labels['L2_category'], L2_unique_labels)
test_category_L3 = _encode_category_column(test_category_labels['L3_category'], L3_unique_labels)
test_category_L4 = _encode_category_column(test_category_labels['L4_category'], L4_unique_labels)

In [14]:
# Fixed token length of every encoded label.
label_length = 25


def _tokenize_labels(labels):
    """Encode a list of label strings into an id matrix of width `label_length`.

    Shorter labels are padded to `label_length`; `truncation=True` cuts longer
    ones to the same width, so the batch is always rectangular (without it a
    single over-long label makes the numpy conversion fail).
    """
    return tokenizer.batch_encode_plus(
        labels,
        return_tensors='np',
        max_length=label_length,
        padding='max_length',
        truncation=True,
    ).input_ids


L0_unique_tokens = _tokenize_labels(L0_unique_labels)
L1_unique_tokens = _tokenize_labels(L1_unique_labels)
L2_unique_tokens = _tokenize_labels(L2_unique_labels)
L3_unique_tokens = _tokenize_labels(L3_unique_labels)
L4_unique_tokens = _tokenize_labels(L4_unique_labels)

In [15]:
def _embed_labels(label_tokens):
    """Return the encoder's `pooler_output` for a matrix of padded label token ids.

    Args:
        label_tokens: numpy integer array of token ids, padded with the
            tokenizer's pad token.

    Returns:
        The `pooler_output` tensor of `roberta`, one row per label.
    """
    # Mask out padding so pad positions are not attended to; without a mask the
    # encoder treats every position, padding included, as real input.
    attention_mask = (label_tokens != tokenizer.pad_token_id).astype('int32')
    return roberta(label_tokens, attention_mask=attention_mask, return_dict=True)['pooler_output']


L0_unique_embeds = _embed_labels(L0_unique_tokens)
L1_unique_embeds = _embed_labels(L1_unique_tokens)
L2_unique_embeds = _embed_labels(L2_unique_tokens)
L3_unique_embeds = _embed_labels(L3_unique_tokens)
L4_unique_embeds = _embed_labels(L4_unique_tokens)

2024-09-02 23:31:20.200880: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


In [16]:
def stack_embeds(k):
    """Repeat each level's label-embedding tensor `k` times along a new leading axis.

    Args:
        k: number of copies to stack.

    Returns:
        dict with keys "L0_embeds" .. "L4_embeds" (in that order), each the
        corresponding `L*_unique_embeds` stacked `k` times.
    """
    per_level = (
        ("L0_embeds", L0_unique_embeds),
        ("L1_embeds", L1_unique_embeds),
        ("L2_embeds", L2_unique_embeds),
        ("L3_embeds", L3_unique_embeds),
        ("L4_embeds", L4_unique_embeds),
    )
    return {key: tf.stack([embeds] * k) for key, embeds in per_level}

In [17]:
# Token length used as `max_length` when encoding prompts and as the model's input width.
SEQ_LEN=100
# Prompt layout; the placeholders are filled from each input record via `str.format`.
prompt_template="Title:{title}\nStore:{store}\nManufacturer:{details_Manufacturer}"

In [18]:
def preprocess(prompt, l0, l1, l2, l3, l4):
    """Tokenise one prompt and pass its labels through unchanged.

    Meant to be called through `tf.py_function`, so `prompt` arrives as an
    eager string tensor.

    Args:
        prompt: scalar string tensor holding the UTF-8 encoded prompt.
        l0, l1, l2, l3, l4: label values of the five levels; returned as-is.

    Returns:
        tuple `(input_ids, attention_mask, l0, l1, l2, l3, l4)` where the first
        two are 1-D arrays of length `SEQ_LEN`.
    """
    prompt = prompt.numpy().decode('utf-8')
    # Let the tokenizer truncate (rather than slicing the ids afterwards) so an
    # over-long prompt still ends with the closing special token.
    tokens = tokenizer.encode_plus(
        prompt,
        max_length=SEQ_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='np',
    )
    return (tokens.input_ids[0], tokens.attention_mask[0], l0, l1, l2, l3, l4)

In [19]:
def _format_prompt(record):
    """Fill `prompt_template` with the fields of one input record."""
    return prompt_template.format(**record)


# exception_behaviour='immediate': raise as soon as a record cannot be formatted
# (e.g. a missing field). With pqdm's default the exception object is returned
# in the result list instead, and would only surface later as a confusing
# failure when the prompts are turned into tensors.
train_inputs_prompts = pqdm(train_inputs, _format_prompt, n_jobs=12, exception_behaviour='immediate')
test_inputs_prompts = pqdm(test_inputs, _format_prompt, n_jobs=12, exception_behaviour='immediate')

QUEUEING TASKS | :   0%|          | 0/443499 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/443499 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/443499 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/95035 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/95035 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/95035 [00:00<?, ?it/s]

In [20]:
BATCH_SIZE=32


def _build_dataset(prompts, l0, l1, l2, l3, l4):
    """Build the shuffled, tokenised and batched pipeline shared by both splits.

    Each element is (prompt, five label vectors); `preprocess` is applied via
    `tf.py_function`, with all seven outputs declared as float32.
    """
    def _tokenize(prompt, l0, l1, l2, l3, l4):
        return tf.py_function(
            preprocess, [prompt, l0, l1, l2, l3, l4],
            [tf.float32] * 7)

    examples = tf.data.Dataset.from_tensor_slices((prompts, l0, l1, l2, l3, l4))
    examples = examples.shuffle(1000)
    examples = examples.map(_tokenize, num_parallel_calls=tf.data.AUTOTUNE)
    return examples.batch(BATCH_SIZE)


train_dataset = _build_dataset(
    train_inputs_prompts,
    train_category_L0,
    train_category_L1,
    train_category_L2,
    train_category_L3,
    train_category_L4,
)

test_dataset = _build_dataset(
    test_inputs_prompts,
    test_category_L0,
    test_category_L1,
    test_category_L2,
    test_category_L3,
    test_category_L4,
)

In [21]:
# The tf.data pipelines have already been built from these objects,
# so drop the notebook-level names for the training split ...
del train_inputs_prompts, train_category_L0, train_category_L1
del train_category_L2, train_category_L3, train_category_L4
# ... and for the validation split.
del test_inputs_prompts, test_category_L0, test_category_L1
del test_category_L2, test_category_L3, test_category_L4

In [22]:
# Settings passed to every MultiHeadAttention layer of the prediction heads.
num_heads = 12
key_dim = 64
# Only referenced by a commented-out loop in the model cell; currently has no effect.
num_cross_attn_blocks = 2

In [23]:
from tensorflow import keras

In [24]:
# Functional model: one shared encoder for the prompt plus five independent heads,
# each scoring the candidate labels of one hierarchy level (L0..L4) against the prompt.

# Tokenised prompt, fixed at SEQ_LEN tokens.
input_ids = layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype=tf.int32)
attention_mask = layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype=tf.int32)

# Candidate-label embeddings, supplied as model inputs; the per-example shape of each
# input is the full shape of the matching `L*_unique_embeds` tensor.
L0_input = layers.Input(shape=(L0_unique_embeds.shape), name='L0_embeds')
L1_input = layers.Input(shape=(L1_unique_embeds.shape), name='L1_embeds')
L2_input = layers.Input(shape=(L2_unique_embeds.shape), name='L2_embeds')
L3_input = layers.Input(shape=(L3_unique_embeds.shape), name='L3_embeds')
L4_input = layers.Input(shape=(L4_unique_embeds.shape), name='L4_embeds') 

# Per-token hidden states of the prompt.
prompt_embeds = roberta(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)['last_hidden_state']

# Every head below follows the same recipe:
#   1. cross-attention with the label embeddings as query and the prompt tokens as value,
#   2. residual connection + layer normalisation,
#   3. Dense(1) to get one score per candidate label, squeezed to drop the size-1 axis.

# L0 predictor
# for _ in range(num_cross_attn_blocks):
L0_attn_output = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(L0_input, prompt_embeds)
L0_embeds = keras.layers.LayerNormalization()(L0_attn_output + L0_input)
L0_logits = keras.layers.Dense(1)(L0_embeds)
L0_logits = tf.squeeze(L0_logits, axis=-1)

# L1 predictor
L1_attn_output = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(L1_input, prompt_embeds)
L1_embeds = keras.layers.LayerNormalization()(L1_attn_output + L1_input)
L1_logits = keras.layers.Dense(1)(L1_embeds)
L1_logits = tf.squeeze(L1_logits, axis=-1)

# L2 predictor
L2_attn_output = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(L2_input, prompt_embeds)
L2_embeds = keras.layers.LayerNormalization()(L2_attn_output + L2_input)
L2_logits = keras.layers.Dense(1)(L2_embeds)
L2_logits = tf.squeeze(L2_logits, axis=-1)

# L3 predictor
L3_attn_output = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(L3_input, prompt_embeds)
L3_embeds = keras.layers.LayerNormalization()(L3_attn_output + L3_input)
L3_logits = keras.layers.Dense(1)(L3_embeds)
L3_logits = tf.squeeze(L3_logits, axis=-1)

# L4 predictor
L4_attn_output = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(L4_input, prompt_embeds)
L4_embeds = keras.layers.LayerNormalization()(L4_attn_output + L4_input)
L4_logits = keras.layers.Dense(1)(L4_embeds)
L4_logits = tf.squeeze(L4_logits, axis=-1)


# Normalise the per-label scores with softmax. From here on the `*_logits` names
# hold softmax outputs (probabilities), not raw logits.
L0_logits = keras.layers.Softmax(name='L0_preds')(L0_logits)
L1_logits = keras.layers.Softmax(name='L1_preds')(L1_logits)
L2_logits = keras.layers.Softmax(name='L2_preds')(L2_logits)
L3_logits = keras.layers.Softmax(name='L3_preds')(L3_logits)
L4_logits = keras.layers.Softmax(name='L4_preds')(L4_logits)


# Inputs are positional: prompt ids, prompt mask, then the five label-embedding tensors;
# outputs are the five per-level probability tensors in L0..L4 order.
model = keras.Model(
    inputs=[input_ids, attention_mask, L0_input, L1_input, L2_input, L3_input, L4_input],
    outputs=[L0_logits, L1_logits, L2_logits, L3_logits, L4_logits]
)

In [25]:
def loss_fn(gt, preds):
    """Multi-task loss: per-example sum of the level losses, averaged over the batch.

    Args:
        gt: sequence of one-hot target tensors, one per level, each (batch, num_classes).
        preds: sequence of predicted probability tensors, aligned with `gt`.

    Returns:
        Scalar tensor: mean over the batch of the summed categorical
        cross-entropies of all levels.
    """
    # One (batch,) loss vector per level.
    per_level_losses = [
        keras.losses.categorical_crossentropy(gt_item, preds_item)
        for (gt_item, preds_item) in zip(gt, preds)
    ]

    # Sum across levels for each example, then average across examples. The former
    # code stacked the levels on axis 0 and summed over axis 1 (the batch), which
    # made the loss grow with the batch size.
    per_example_loss = tf.add_n(per_level_losses)
    return tf.reduce_mean(per_example_loss)

In [26]:
from tensorflow.keras import metrics
from tensorflow.keras.utils import Progbar

In [27]:
# AdamW with its default hyper-parameters.
# NOTE(review): the default learning rate may be too high for fine-tuning a pretrained encoder -- confirm.
optimizer = tf.keras.optimizers.AdamW()
# Only the optimizer is attached; no loss or metrics are passed to compile here.
model.compile(
    optimizer=optimizer
)

In [28]:
# Label embeddings of levels L0..L4 (in that order), each repeated BATCH_SIZE times
# along a new leading axis so they can be fed next to a batch of prompts.
# NOTE(review): the leading axis is fixed at BATCH_SIZE; a smaller final batch would not
# match it -- confirm how partial batches are handled by the consumers.
label_embeds = list(stack_embeds(BATCH_SIZE).values())

# Metrics
# Running mean of the loss plus one accuracy metric per level, for the training split ...
train_loss_metric = metrics.Mean(name='train_loss')
train_accuracy_metrics = [
    metrics.CategoricalAccuracy(name=f'train_accuracy_task_{i}') for i in range(5)
]

# ... and the same set for the validation split.
test_loss_metric = metrics.Mean(name='test_loss')
test_accuracy_metrics = [
    metrics.CategoricalAccuracy(name=f'test_accuracy_task_{i}') for i in range(5)
]

# Checkpointing setup
# Tracks optimizer and model state; at most 5 checkpoints are kept under ./checkpoints.
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
checkpoint_manager = tf.train.CheckpointManager(checkpoint, './checkpoints', max_to_keep=5)

@tf.function
def train_step(input_ids, attention_mask, l0, l1, l2, l3, l4):
    """Run one optimisation step on a batch and update the training metrics.

    Args:
        input_ids, attention_mask: tokenised prompts of the batch.
        l0, l1, l2, l3, l4: one-hot targets of the five levels.
    """
    targets = [l0, l1, l2, l3, l4]

    # `label_embeds` is pre-stacked for a full batch; cut it down to the actual
    # batch size so the final, smaller batch of an epoch does not cause a
    # batch-dimension mismatch inside the attention layers.
    batch_size = tf.shape(input_ids)[0]
    batch_label_embeds = [embeds[:batch_size] for embeds in label_embeds]

    with tf.GradientTape() as tape:
        predictions = model([input_ids, attention_mask] + batch_label_embeds, training=True)
        loss = loss_fn(targets, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss_metric.update_state(loss)
    for metric, labels, pred in zip(train_accuracy_metrics, targets, predictions):
        metric.update_state(labels, pred)

# Testing step
@tf.function
def test_step(input_ids, attention_mask, l0, l1, l2, l3, l4):
    """Evaluate one batch (no gradient update) and update the validation metrics.

    Args:
        input_ids, attention_mask: tokenised prompts of the batch.
        l0, l1, l2, l3, l4: one-hot targets of the five levels.
    """
    targets = [l0, l1, l2, l3, l4]

    # `label_embeds` is pre-stacked for a full batch; cut it down to the actual
    # batch size so the final, smaller batch does not cause a batch-dimension
    # mismatch inside the attention layers.
    batch_size = tf.shape(input_ids)[0]
    batch_label_embeds = [embeds[:batch_size] for embeds in label_embeds]

    predictions = model([input_ids, attention_mask] + batch_label_embeds, training=False)
    loss = loss_fn(targets, predictions)

    test_loss_metric.update_state(loss)
    for metric, labels, pred in zip(test_accuracy_metrics, targets, predictions):
        metric.update_state(labels, pred)

# Custom training loop
def train_model(epochs, train_dataset, test_dataset):
    """Custom training loop with per-epoch validation and best-model checkpointing.

    Args:
        epochs: number of passes over `train_dataset`.
        train_dataset: batched dataset yielding
            (input_ids, attention_mask, l0, l1, l2, l3, l4); must support `len()`.
        test_dataset: validation dataset with the same element structure.

    A checkpoint is written through `checkpoint_manager` whenever the epoch's
    validation loss is lower than every previous epoch's of this call.
    """
    # Must exist before the first comparison below; it was previously read
    # before ever being assigned, which raised UnboundLocalError after epoch 1.
    best_test_loss = float('inf')

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')

        # Training
        progbar = Progbar(target=len(train_dataset), unit_name='step')
        for step, (input_ids, attention_mask, l0, l1, l2, l3, l4) in enumerate(train_dataset):
            train_step(input_ids, attention_mask, l0, l1, l2, l3, l4)
            progbar.update(step + 1, values=[('loss', train_loss_metric.result())] +
                            [(metric.name, metric.result()) for metric in train_accuracy_metrics])

        # Validation
        print("\nValidation:")
        progbar = Progbar(target=len(test_dataset), unit_name='step')
        for step, (input_ids, attention_mask, l0, l1, l2, l3, l4) in enumerate(test_dataset):
            test_step(input_ids, attention_mask, l0, l1, l2, l3, l4)
            progbar.update(step + 1, values=[('loss', test_loss_metric.result())] +
                            [(metric.name, metric.result()) for metric in test_accuracy_metrics])

        print(f'\nEpoch {epoch + 1} Summary:')
        print(f'Train Loss: {train_loss_metric.result():.4f}')
        for metric in train_accuracy_metrics:
            print(f'{metric.name}: {metric.result():.4f}')

        print(f'Test Loss: {test_loss_metric.result():.4f}')
        for metric in test_accuracy_metrics:
            print(f'{metric.name}: {metric.result():.4f}\n')

        # Checkpointing: keep the weights of the epoch with the lowest validation loss.
        epoch_test_loss = float(test_loss_metric.result())
        if epoch_test_loss < best_test_loss:
            best_test_loss = epoch_test_loss
            checkpoint_save_path = checkpoint_manager.save()
            print(f"Best model saved to {checkpoint_save_path} with loss {best_test_loss:.4f}\n")

        # Reset metrics at the end of each epoch so the next epoch starts from zero
        # (`reset_state` is the current name of the deprecated `reset_states`).
        for metric in [train_loss_metric, *train_accuracy_metrics,
                       test_loss_metric, *test_accuracy_metrics]:
            metric.reset_state()

In [29]:
# Train for 10 epochs, validating on `test_dataset` after every epoch.
train_model(10, train_dataset, test_dataset)

Epoch 1/10


In [None]:
# Persist the final weights in HDF5 format.
weights_path = './v5-approach3/model.h5'
# The HDF5 writer does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
model.save_weights(weights_path)