In [1]:
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
Col

In [63]:
from torch import nn
def depthwise_conv(n_in, n_out, compress_rate, k, stride):
  conv = nn.Conv1d(n_in, n_out//compress_rate, k, stride = stride)
  nn.init.kaiming_normal_(conv.weight)
  return conv
def pointwise_conv(n_in, n_out, compress_rate, k, stride):
  conv = nn.Conv1d(n_out//compress_rate,n_out, 1)
  nn.init.kaiming_normal_(conv.weight)
  return conv

n_in, n_out = 768, 768
compress_rate = 2
k, stride = 1, 1
dropout = 0.8
bottleneck = 32
    

class Adapter(nn.Module):
    def __init__(
            self,
            input_size,
        ):
        super().__init__()

        self.houlsby_adapter = nn.Sequential(
          nn.Linear(input_size, bottleneck),
          nn.GELU(),
          nn.Linear(bottleneck, input_size),
      )
      #   self.conv_adapter = nn.Sequential(
      #   depthwise_conv(n_in, n_out, compress_rate,k ,stride),
      #   pointwise_conv(n_in, n_out, compress_rate,k ,stride),
      #   nn.Dropout(p=dropout),
      #   nn.GELU()
      # )
    def forward(self, x):
        #print("input of x", x.shape)
        return self.houlsby_adapter(x)


In [64]:
from transformers import AutoModelForSequenceClassification

from torch import nn
import torch

BertLayerNorm = torch.nn.LayerNorm
##vanilla houlsby residual adapter, custom layers
class adapted_bert_output(nn.Module):
  def __init__(self, BertOutput, config):
    super().__init__()
    self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
    self.bert_output = BertOutput
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.adapter = Adapter(config.hidden_size)
  def forward(self,  hidden_states, input_tensor):
    #print(hidden_states.shape)#, x.shape)
    
    hidden_states = self.dense(hidden_states)
    adapter_output = self.adapter(hidden_states)
    hidden_states = self.dropout(hidden_states) + adapter_output
    hidden_states = self.LayerNorm(hidden_states + input_tensor)
    
    
    return hidden_states
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

#add adapter module in a bert model
for idx, layer in enumerate(model_bert.bert.encoder.layer):
  model_bert.bert.encoder.layer[idx].output = adapted_bert_output(model_bert.bert.encoder.layer[idx].output,model_bert.config)

#freeze parameters
for name, param in model_bert.named_parameters():
  if "adapter" in name:
    param.require_grad = True
    print("unfreeze adapter", name)
  else:
    param.require_grad = False

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

unfreeze adapter bert.encoder.layer.0.output.adapter.houlsby_adapter.0.weight
unfreeze adapter bert.encoder.layer.0.output.adapter.houlsby_adapter.0.bias
unfreeze adapter bert.encoder.layer.0.output.adapter.houlsby_adapter.2.weight
unfreeze adapter bert.encoder.layer.0.output.adapter.houlsby_adapter.2.bias
unfreeze adapter bert.encoder.layer.1.output.adapter.houlsby_adapter.0.weight
unfreeze adapter bert.encoder.layer.1.output.adapter.houlsby_adapter.0.bias
unfreeze adapter bert.encoder.layer.1.output.adapter.houlsby_adapter.2.weight
unfreeze adapter bert.encoder.layer.1.output.adapter.houlsby_adapter.2.bias
unfreeze adapter bert.encoder.layer.2.output.adapter.houlsby_adapter.0.weight
unfreeze adapter bert.encoder.layer.2.output.adapter.houlsby_adapter.0.bias
unfreeze adapter bert.encoder.layer.2.output.adapter.houlsby_adapter.2.weight
unfreeze adapter bert.encoder.layer.2.output.adapter.houlsby_adapter.2.bias
unfreeze adapter bert.encoder.layer.3.output.adapter.houlsby_adapter.0.weigh

In [65]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments("test_trainer")
trainer = Trainer(
    model=model_bert, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset
)
trainer.train()

Step,Training Loss


TrainOutput(global_step=375, training_loss=0.6999024251302083, metrics={'train_runtime': 280.9387, 'train_samples_per_second': 10.678, 'train_steps_per_second': 1.335, 'total_flos': 1056031524864000.0, 'train_loss': 0.6999024251302083, 'epoch': 3.0})

In [4]:
from datasets import load_dataset
raw_datasets = load_dataset("imdb")
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments("test_trainer")
trainer = Trainer(
    model=model_bert, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset
)

In [66]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=375, training_loss=0.6475082600911458, metrics={'train_runtime': 285.2805, 'train_samples_per_second': 10.516, 'train_steps_per_second': 1.314, 'total_flos': 1056031524864000.0, 'train_loss': 0.6475082600911458, 'epoch': 3.0})

In [None]:
from torch import nn
from transformers.activations import get_activation


class Adapter(nn.Module):
    def __init__(self, dim, r, act):
        super().__init__()
        self.adapter_A = nn.Linear(dim, r)
        self.act = get_activation(act)
        self.adapter_B = nn.Linear(r, dim)

    def forward(self, x, residual):
        result = self.adapter_A(x)
        result = self.act(result)
        result = self.adapter_B(result)
        return result + residual