In [2]:
!pip install transformers datasets torch

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.13.2-py3-none-any.whl (512 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.7/512.7 kB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch
  Downloading torch-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m111.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting huggingface-hub<1.0,>=0.14.1
  Downloading h

Installing collected packages: tokenizers, xxhash, urllib3, safetensors, regex, pyyaml, pyarrow, nvidia-cuda-runtime-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cublas-cu11, multidict, fsspec, frozenlist, filelock, dill, charset-normalizer, asynctest, async-timeout, yarl, requests, nvidia-cudnn-cu11, multiprocess, aiosignal, torch, huggingface-hub, aiohttp, transformers, datasets
Successfully installed aiohttp-3.8.6 aiosignal-1.3.1 async-timeout-4.0.3 asynctest-0.13.0 charset-normalizer-3.4.0 datasets-2.13.2 dill-0.3.6 filelock-3.12.2 frozenlist-1.3.3 fsspec-2023.1.0 huggingface-hub-0.16.4 multidict-6.0.5 multiprocess-0.70.14 nvidia-cublas-cu11-11.10.3.66 nvidia-cuda-nvrtc-cu11-11.7.99 nvidia-cuda-runtime-cu11-11.7.99 nvidia-cudnn-cu11-8.5.0.96 pyarrow-12.0.1 pyyaml-6.0.1 regex-2024.4.16 requests-2.31.0 safetensors-0.4.5 tokenizers-0.13.3 torch-1.13.1 transformers-4.30.2 urllib3-2.0.7 xxhash-3.5.0 yarl-1.9.4


In [3]:
import torch
import torch.nn as nn
from transformers import (
    BertConfig,
    BertModel,
    BertForMaskedLM,
    BertTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
class BertConfigRelativePositional(BertConfig):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.position_embedding_type = "relative_key_query"

In [4]:
def rotate_every_two(x):
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)

def apply_rotary_positional_embeddings(x, sinusoidal_pos):
    return (x * sinusoidal_pos.cos()) + (rotate_every_two(x) * sinusoidal_pos.sin())

In [5]:
from transformers.models.bert.modeling_bert import BertAttention, BertLayer, BertEncoder

class BertAttentionWithRotary(BertAttention):
    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        # Original attention code here
        # Apply rotary positional embeddings before attention calculation
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        sinusoidal_pos = self.get_sinusoidal_pos(hidden_states)
        query_layer = apply_rotary_positional_embeddings(query_layer, sinusoidal_pos)
        key_layer = apply_rotary_positional_embeddings(key_layer, sinusoidal_pos)
        # Continue with the original attention computation
        # ...

class BertLayerCustom(BertLayer):
    def __init__(self, config):
        super().__init__(config)
        if config.position_embedding_type == "rotary":
            self.attention = BertAttentionWithRotary(config)

In [6]:
from transformers.models.bert.modeling_bert import BertModel

class BertModelCustom(BertModel):
    def __init__(self, config):
        super().__init__(config)
        if config.position_embedding_type in ["relative_key_query", "rotary"]:
            self.encoder = BertEncoderCustom(config)

class BertEncoderCustom(BertEncoder):
    def __init__(self, config):
        super().__init__(config)
        self.layer = nn.ModuleList([BertLayerCustom(config) for _ in range(config.num_hidden_layers)])


In [7]:
# Load a small dataset for demonstration purposes
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], return_special_tokens_mask=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [9]:
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

In [10]:
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=4,
    max_position_embeddings=512,
)

# Choose the type of positional embeddings
config.position_embedding_type = 'absolute'  # Options: 'absolute', 'relative_key_query', 'rotary'

model = BertForMaskedLM(config)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [11]:
# Freeze embeddings and first two layers
for param in model.bert.embeddings.parameters():
    param.requires_grad = False

for idx, layer in enumerate(model.bert.encoder.layer):
    if idx < 2:
        for param in layer.parameters():
            param.requires_grad = False

In [12]:
# Modify the forward method to output hidden states
config.output_hidden_states = True

# After training, get outputs and probe layers
def probe_model(input_ids):
    with torch.no_grad():
        outputs = model(input_ids)
        hidden_states = outputs.hidden_states  # Tuple of hidden states from all layers
    return hidden_states

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py", line 84, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 1507, in forward
    prediction_scores = self.cls(sequence_output)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 798, in forward
    prediction_scores = self.predictions(sequence_output)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 788, in forward
    hidden_states = self.decoder(hidden_states)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/I6356345/anaconda3/lib/python3.12/site-packages/torch/nn/modules/linear.py", line 117, in forward
    return F.linear(input, self.weight, self.bias)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 450.00 MiB. GPU 0 has a total capacity of 23.46 GiB of which 187.88 MiB is free. Process 105361 has 1.88 GiB memory in use. Process 546825 has 1.27 GiB memory in use. Process 1005137 has 1004.00 MiB memory in use. Process 1059056 has 988.00 MiB memory in use. Process 1218947 has 17.56 GiB memory in use. Including non-PyTorch memory, this process has 634.00 MiB memory in use. Of the allocated memory 354.20 MiB is allocated by PyTorch, and 15.80 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [14]:
# Get a batch of data
sample = next(iter(tokenized_dataset))
input_ids = torch.tensor([sample['input_ids']])

# Probe the model
hidden_states = probe_model(input_ids)

# Analyze the hidden states from different layers
for idx, layer_hidden_state in enumerate(hidden_states):
    print(f"Layer {idx}: {layer_hidden_state.shape}")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)