In [None]:
! pip install transformers datasets

In [65]:
from torch import nn
def depthwise_conv(n_in, n_out, compress_rate, k, stride):
  conv = nn.Conv1d(n_in, n_out//compress_rate, k, stride = stride)
  nn.init.kaiming_normal_(conv.weight)
  return conv
def pointwise_conv(n_in, n_out, compress_rate, k, stride):
  conv = nn.Conv1d(n_out//compress_rate,n_out, 1)
  nn.init.kaiming_normal_(conv.weight)
  return conv

n_in, n_out = 768, 768
compress_rate = 2
k, stride = 1, 1
dropout = 0.8
bottleneck = 32
    

class Adapter(nn.Module):
    def __init__(
            self,
            input_size,
        ):
        super().__init__()

        self.houlsby_adapter = nn.Sequential(
          nn.Linear(n_in, bottleneck),
          nn.GELU(),
          nn.Linear(bottleneck, n_out),
      )
        self.conv_adapter = nn.Sequential(
        depthwise_conv(n_in, n_out, compress_rate,k ,stride),
        pointwise_conv(n_in, n_out, compress_rate,k ,stride),
        nn.Dropout(p=dropout),
        nn.GELU()
      )
    def forward(self, x):
        return self.houlsby_adapter(x)


In [70]:
from transformers import AutoModelForSequenceClassification
from torch import nn
import torch

##vanilla houlsby residual adapter, custom layers
class adapted_bert_layer(nn.Module):
  def __init__(self, bert_layer):
    super().__init__()
    self.bert_layer = bert_layer
    self.adapter = Adapter(768)
  def forward(self, *x):
    bert_out = self.bert_layer(*x)
    return self.adapter(x) + self.bert_layer(x)
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

#add adapter module in a bert model
for idx, layer in enumerate(model_bert.bert.encoder.layer):
  model_bert.bert.encoder.layer[idx] = adapted_bert_layer(model_bert.bert.encoder.layer[idx])

#freeze parameters
for name, param in model_bert.named_parameters():
  if "adapter" in name:
    param.require_grad = True
    print("unfreeze adapter", name)
  else:
    param.require_grad = False


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

unfreeze adapter bert.encoder.layer.0.adapter.houlsby_adapter.0.weight
unfreeze adapter bert.encoder.layer.0.adapter.houlsby_adapter.0.bias
unfreeze adapter bert.encoder.layer.0.adapter.houlsby_adapter.2.weight
unfreeze adapter bert.encoder.layer.0.adapter.houlsby_adapter.2.bias
unfreeze adapter bert.encoder.layer.0.adapter.conv_adapter.0.weight
unfreeze adapter bert.encoder.layer.0.adapter.conv_adapter.0.bias
unfreeze adapter bert.encoder.layer.0.adapter.conv_adapter.1.weight
unfreeze adapter bert.encoder.layer.0.adapter.conv_adapter.1.bias
unfreeze adapter bert.encoder.layer.1.adapter.houlsby_adapter.0.weight
unfreeze adapter bert.encoder.layer.1.adapter.houlsby_adapter.0.bias
unfreeze adapter bert.encoder.layer.1.adapter.houlsby_adapter.2.weight
unfreeze adapter bert.encoder.layer.1.adapter.houlsby_adapter.2.bias
unfreeze adapter bert.encoder.layer.1.adapter.conv_adapter.0.weight
unfreeze adapter bert.encoder.layer.1.adapter.conv_adapter.0.bias
unfreeze adapter bert.encoder.layer.1.

In [71]:
from datasets import load_dataset
raw_datasets = load_dataset("imdb")
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]



  0%|          | 0/3 [00:00<?, ?it/s]



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]



In [None]:
from transformers import TrainingArguments
training_args = TrainingArguments("test_trainer")
from transformers import Trainer
trainer = Trainer(
    model=model_bert, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset
)

In [None]:
trainer.train()

In [21]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Initialize the BERT model and tokenizer
model = BertModelWithAdapters.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000,
)

# Freeze the parameters of the first two layers
model.encoder.layer[0].requires_grad_(False)
model.encoder.layer[1].requires_grad_(False)

# Define the model_init function to pass to the Trainer
def model_init():
    return model

# Initialize the Trainer
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

AttributeError: ignored

In [22]:
model.bert.encoder.layer

ModuleList(
  (0-11): 12 x BertLayer(
    (attention): BertAttention(
      (self): BertSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (output): BertSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (intermediate): BertIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): BertOutput(
      (dense): Linear(in_features=3072, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)

In [27]:
getattr(model.bert.encoder.layer[1 - 1], "adapter")

AttributeError: ignored