<a href="https://colab.research.google.com/github/yuchenj90/playground/blob/master/LoRALayers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any.

In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import load_dataset, DatasetDict
from functools import partial
from torch.utils.data import DataLoader
from tqdm import tqdm

In [3]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
dataset = load_dataset("imdb")
train_dataset = dataset["train"].shuffle(seed=42).select(range(2048))
test_dataset = dataset["test"].shuffle(seed=42).select(range(1024))
print(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

TASK
1. Implement a LORA layer in pure torch
2. Replace every linear layer in the above model to use a LORA layer instead
3. Run a training job where we only train the lora weights

In [4]:
class LoRALayer(torch.nn.Module):
  def __init__(self, in_dim, out_dim, rank, alpha):
    super().__init__()
    std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
    self.A = torch.nn.Parameter(torch.randn(in_dim, rank) * std_dev)
    self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
    self.alpha = alpha

  def forward(self, x):
    x = self.alpha * (x @ self.A @ self.B)
    return x

class LinearWithLoRA(torch.nn.Module):
    def __init__(self, linear, rank, alpha):
      super().__init__()
      self.linear = linear
      self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)

    def forward(self, x):
      return self.linear(x) + self.lora(x)


In [5]:
lora_r = 8
lora_alpha = 16
assign_lora = partial(LinearWithLoRA, rank=lora_r, alpha=lora_alpha)
for layer in model.distilbert.transformer.layer:
  layer.attention.q_lin = assign_lora(layer.attention.q_lin)
  layer.attention.k_lin = assign_lora(layer.attention.k_lin)
  layer.attention.v_lin = assign_lora(layer.attention.v_lin)
model.pre_classifier = assign_lora(model.pre_classifier)
model.classifier = assign_lora(model.classifier)


In [6]:
for name, param in model.named_parameters():
  if 'pre_classifier' not in name and 'classifier' not in name:
    param.requires_grad = False

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_function = torch.nn.CrossEntropyLoss()

# For GPUs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DataLoader setup
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

model.train()
for epoch in range(3):
  total_loss = 0
  for batch in tqdm(train_loader):
    xx = tokenizer(batch['text'], truncation=True, padding=True, max_length=512, return_tensors='pt')
    xx = {k: v.to(device) for k, v in xx.items()}
    outputs = model(**xx)
    loss = loss_function(outputs.logits, batch["label"].to(device))
    total_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print(f"Epoch {epoch}: Loss {total_loss/len(train_loader)}")

100%|██████████| 128/128 [00:50<00:00,  2.56it/s]


Epoch 0: Loss 0.3063111072697211


100%|██████████| 128/128 [00:51<00:00,  2.48it/s]


Epoch 1: Loss 0.287555424583843


100%|██████████| 128/128 [00:47<00:00,  2.69it/s]

Epoch 2: Loss 0.2779863146133721



