Ulysses Curiosity offers a group of 10 preconfigured probing tasks for the PT-br language, following the original paper [(Conneau et al. 2018)](https://aclanthology.org/P18-1198/). This notebook showcases how to instantiate a preconfigured probing task with a HuggingFace Transformers model and with a vanilla PyTorch bidirectional LSTM model.


**Table of Contents**
1. [HuggingFace Transformers example](#HuggingFace-Transformers-example)
2. [Vanilla PyTorch example](#Vanilla-PyTorch-example)

In [1]:
# (1): import needed packages.
import typing as t
import curiosidade
import torch
import torch.nn
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# HuggingFace Transformers example

In [2]:
import transformers
import datasets

In [3]:
# (2): load your pretrained model.
model_name = "neuralmind/bert-base-portuguese-cased"
bert = transformers.BertForTokenClassification.from_pretrained(model_name)
# The checkpoint is a BERT model, so its tokenizer is loaded with the matching
# BERT tokenizer class (not the DistilBERT one).
tokenizer = transformers.BertTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model check

In [4]:
# (3): set up your probing model (a single hidden layer dimension: 128).
ProbingModel = curiosidade.probers.utils.get_probing_model_for_sequences(
    hidden_layer_dims=[128],
)

In [5]:
# (4): set up a preconfigured probing task.
def fn_text_to_tensor_for_huggingface_transformers(
    content: list[str],
    labels: list[int],
    split: t.Literal["train", "eval", "test"],
) -> "datasets.Dataset":
    """Tokenize one probing dataset split into a HuggingFace ``datasets.Dataset``.

    The texts are tokenized with the module-level ``tokenizer``; every
    instance is truncated/padded to exactly 48 tokens.

    Parameters
    ----------
    content : list[str]
        Texts of the split.
    labels : list[int]
        Labels aligned with ``content``.
    split : {"train", "eval", "test"}
        Name of the split being converted. Unused here: every split is
        converted the same way.

    Returns
    -------
    datasets.Dataset
        The tokenizer outputs plus a ``"labels"`` column, with the dataset
        format set to ``"torch"``.
    """
    max_instances = 2500  # Subsampling to speed up this example.
    content = content[:max_instances]
    labels = labels[:max_instances]

    encoded = tokenizer(
        content,
        truncation=True,
        padding="max_length",
        max_length=48,
    )
    encoded["labels"] = labels

    dataset = datasets.Dataset.from_dict(encoded)
    dataset.set_format("torch")

    return dataset


def accuracy_fn(logits, target):
    """Compute the accuracy of the highest-scoring class against ``target``.

    The predicted class is the index of the maximum along the last axis of
    ``logits``. Returns a dict with the single key ``"accuracy"`` holding a
    Python float.
    """
    predicted_ids = logits.max(axis=-1).indices
    hits = (predicted_ids == target).float()
    return {"accuracy": hits.mean().item()}


# Preconfigured sentence length probing task, fed by the HuggingFace-specific
# text-to-tensor function and scored with `accuracy_fn`.
task = curiosidade.ProbingTaskSentenceLength(
    metrics_fn=accuracy_fn,
    fn_text_to_tensor=fn_text_to_tensor_for_huggingface_transformers,
)

In [6]:
import functools

# (5): set up a ProbingModelFactory, which combines the probing model and the probing task.
# Note: both the probing model and the optimizer are passed uninstantiated.
probing_factory = curiosidade.ProbingModelFactory(
    task=task,
    probing_model_fn=ProbingModel,
    optim_fn=functools.partial(torch.optim.Adam, lr=0.001),
)

# (6): attach the probing models to the pretrained model layers.
# `modules_to_attach` may also be a container of names, like ["name_a", "name_b", ...].
prober_container = curiosidade.attach_probers(
    base_model=bert,
    probing_model_factory=probing_factory,
    modules_to_attach="bert.encoder.layer.0.output.LayerNorm",
    device="cpu",
    random_seed=16,
    prune_unrelated_modules="infer",
)

# Configuration summary.
print(f"{prober_container = }")
# Lists all probed module names.
print(f"{prober_container.probed_modules = }")

prober_container = ProbingModelContainer:
(a): Base model: InferencePrunerExtensor(HuggingfaceAdapter(BertForTokenClassification(
 |  |  (bert): BertModel(
 |  |    (embeddings): BertEmbeddings(
 |  |      (word_embeddings): Embedding(29794, 768, padding_idx=0)
 |  |      (position_embeddings): Embedding(512, 768)
 |  |      (token_type_embeddings): Embedding(2, 768)
 |  |      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 |  |      (dropout): Dropout(p=0.1, inplace=False)
 |  |    )
 |  |    (encoder): BertEncoder(
 |  |      (layer): ModuleList(
 |  |        (0): BertLayer(
 |  |          (attention): BertAttention(
 |  |            (self): BertSelfAttention(
 |  |              (query): Linear(in_features=768, out_features=768, bias=True)
 |  |              (key): Linear(in_features=768, out_features=768, bias=True)
 |  |              (value): Linear(in_features=768, out_features=768, bias=True)
 |  |              (dropout): Dropout(p=0.1, inplace=False)
 |  |  

In [7]:
# (7): train probing models for 6 epochs, showing a progress bar.
probing_results = prober_container.train(
    num_epochs=6,
    show_progress_bar=True,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:17<00:00, 12.89s/it]


In [8]:
# (8): aggregate results by batch index, using the mean and the standard deviation.
df_train, df_eval, df_test = probing_results.to_pandas(
    aggregate_fn=[np.mean, np.std],
    aggregate_by=["batch_index"],
)

print(df_train)

   epoch metric_name                                 module    metric  \
                                                                 mean   
0      0    accuracy  bert.encoder.layer.0.output.LayerNorm  0.218359   
1      0        loss  bert.encoder.layer.0.output.LayerNorm  1.755534   
2      1    accuracy  bert.encoder.layer.0.output.LayerNorm  0.238741   
3      1        loss  bert.encoder.layer.0.output.LayerNorm  1.685760   
4      2    accuracy  bert.encoder.layer.0.output.LayerNorm  0.308847   
5      2        loss  bert.encoder.layer.0.output.LayerNorm  1.622230   
6      3    accuracy  bert.encoder.layer.0.output.LayerNorm  0.417050   
7      3        loss  bert.encoder.layer.0.output.LayerNorm  1.542018   
8      4    accuracy  bert.encoder.layer.0.output.LayerNorm  0.404527   
9      4        loss  bert.encoder.layer.0.output.LayerNorm  1.465292   
10     5    accuracy  bert.encoder.layer.0.output.LayerNorm  0.500597   
11     5        loss  bert.encoder.layer.0.output.L

# Vanilla PyTorch example

The only difference between the previous example and this one lies in how the `fn_text_to_tensor` function is specified. More precisely, HF Transformers models require a batch of training samples in the HuggingFace `datasets` format, whereas vanilla PyTorch models generally use `torch.utils.data.TensorDataset`.

In [9]:
import os
import typing as t

import curiosidade
import torch
import torch.nn
import tokenizers
import buscador
import numpy as np


# Names of the pretrained resources (LSTM checkpoint and its tokenizer) used in this example.
model_name = "512_hidden_dim_6000_vocab_size_1_layer_lstm"
tokenizer_name = "6000_subword_tokenizer"

# Download the model first and the tokenizer second.
for _resource_name in (model_name, tokenizer_name):
    buscador.download_resource(
        task_name="legal_text_segmentation",
        resource_name=_resource_name,
    )

In [10]:
# (2): load your pretrained model.
class LSTMSegmenter(torch.nn.Module):
    """Embedding layer, followed by a bidirectional LSTM and a linear output layer."""

    def __init__(self):
        super().__init__()

        embedding_dim = 768
        hidden_size = 512

        self.embeddings = torch.nn.Embedding(
            num_embeddings=6000,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = torch.nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        # The LSTM is bidirectional, hence the doubled input size.
        self.lin_out = torch.nn.Linear(2 * hidden_size, 4)

    def forward(self, input_ids: torch.Tensor):
        """Map token ids to the output of the final linear layer."""
        embedded = self.embeddings(input_ids)
        # The LSTM also returns its final (hidden, cell) states; only the
        # per-position outputs are used.
        hidden_states, _ = self.lstm(embedded)
        return self.lin_out(hidden_states)

    
# NOTE(review): `torch.load` unpickles the file, so only load checkpoints from
# a trusted source.
# `map_location="cpu"` lets the checkpoint load even when the device it was
# saved from is not available on this machine.
pretrained_state_dict = torch.load(f"{model_name}.pt", map_location="cpu")
lstm = LSTMSegmenter()
lstm.load_state_dict(pretrained_state_dict)
tokenizer = tokenizers.Tokenizer.from_file(os.path.join(tokenizer_name, "tokenizer.json"))

# (3): set up your probing model.
ProbingModel = curiosidade.probers.utils.get_probing_model_for_sequences(hidden_layer_dims=[128])

In [11]:
# (4): set up a preconfigured probing task.
def fn_text_to_tensor_for_pytorch(
    content: list[str],
    labels: list[int],
    split: t.Literal["train", "eval", "test"],
) -> "torch.utils.data.TensorDataset":
    """Encode one probing dataset split into a ``TensorDataset`` of token ids and labels.

    The texts are encoded with the module-level ``tokenizer``; every instance
    is truncated to at most 48 token ids and right-padded with zeros up to the
    longest instance in the split.

    Parameters
    ----------
    content : list[str]
        Texts of the split.
    labels : list[int]
        Labels aligned with ``content``.
    split : {"train", "eval", "test"}
        Name of the split being converted. Unused here: every split is
        converted the same way.

    Returns
    -------
    torch.utils.data.TensorDataset
        Dataset of ``(token_ids, labels)``, both of dtype ``torch.long``.
    """
    max_instances = 2500  # Subsampling to speed up this example.
    max_tokens = 48
    content = content[:max_instances]
    labels = labels[:max_instances]

    # Build integer tensors directly (no float32 round trip, which cannot
    # represent every integer above 2**24), truncating before the tensor is created.
    token_ids = [
        torch.tensor(encoding.ids[:max_tokens], dtype=torch.long)
        for encoding in tokenizer.encode_batch(content)
    ]

    X = torch.nn.utils.rnn.pad_sequence(token_ids, batch_first=True, padding_value=0)
    y = torch.tensor(labels, dtype=torch.long)

    return torch.utils.data.TensorDataset(X, y)


def accuracy_fn(logits, target):
    """Compute the accuracy of the highest-scoring class against ``target``.

    The predicted class is the index of the maximum along the last axis of
    ``logits``. Returns a dict with the single key ``"accuracy"`` holding a
    Python float.
    """
    predicted_ids = logits.max(axis=-1).indices
    hits = (predicted_ids == target).float()
    return {"accuracy": hits.mean().item()}


# Preconfigured sentence length probing task, fed by the PyTorch-specific
# text-to-tensor function and scored with `accuracy_fn`.
task = curiosidade.ProbingTaskSentenceLength(
    metrics_fn=accuracy_fn,
    fn_text_to_tensor=fn_text_to_tensor_for_pytorch,
)

In [12]:
import functools

# (5): set up a ProbingModelFactory, which combines the probing model and the probing task.
probing_factory = curiosidade.ProbingModelFactory(
    probing_model_fn=ProbingModel,  # Note: do not instantiate.
    optim_fn=functools.partial(torch.optim.Adam, lr=0.001),  # Note: do not instantiate.
    task=task,
)

# (6): attach the probing models to the pretrained model layers.
# Fall back to the CPU when CUDA is unavailable, so this cell also runs on
# machines without a GPU.
prober_container = curiosidade.attach_probers(
    base_model=lstm,
    probing_model_factory=probing_factory,
    modules_to_attach="lstm",  # or a container like ["name_a", "name_b", ...]
    random_seed=16,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

print(f"{prober_container = }")  # Configuration summary.
print(f"{prober_container.probed_modules = }")  # Lists all probed module names.

# (7): train probing models.
probing_results = prober_container.train(num_epochs=6, show_progress_bar=True)

prober_container = ProbingModelContainer:
(a): Base model: InferencePrunerExtensor(TorchModuleAdapter(LSTMSegmenter(
 |  |  (embeddings): Embedding(6000, 768, padding_idx=0)
 |  |  (lstm): LSTM(768, 512, batch_first=True, bidirectional=True)
 |  |  (lin_out): Linear(in_features=1024, out_features=4, bias=True)
 |  |)))
 | (a): No pruned modules.
 |
(b): Task name: sentence length (sentlen)
(c): Probing dataset(s):
 | (train):   20 batches of size (at most) 128.
 | (eval) :   10 batches of size (at most) 256.
 | (test) :   10 batches of size (at most) 256.
(d): Probed module(s) (1 in total):
 | (0): lstm
prober_container.probed_modules = ('lstm',)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:07<00:00,  1.30s/it]


In [13]:
# (8): aggregate results by batch index, using the mean and the standard deviation.
df_train, df_eval, df_test = probing_results.to_pandas(
    aggregate_fn=[np.mean, np.std],
    aggregate_by=["batch_index"],
)

print(df_train)

   epoch metric_name module    metric          
                                 mean       std
0      0    accuracy   lstm  0.226723  0.059499
1      0        loss   lstm  1.725622  0.052323
2      1    accuracy   lstm  0.337569  0.054889
3      1        loss   lstm  1.619361  0.041703
4      2    accuracy   lstm  0.395634  0.065277
5      2        loss   lstm  1.495245  0.045395
6      3    accuracy   lstm  0.446415  0.056187
7      3        loss   lstm  1.373625  0.047485
8      4    accuracy   lstm  0.456250  0.055614
9      4        loss   lstm  1.290307  0.054287
10     5    accuracy   lstm  0.497243  0.061581
11     5        loss   lstm  1.211972  0.047279
