An implementation of https://arxiv.org/pdf/2106.09685

LoRA, short for low-rank adaptation, introduces a pair of trainable matrices whose rank is much lower than that of the original weight matrix. During fine-tuning on downstream tasks only these low-rank matrices are trained, while the original weights of the pre-trained model stay frozen.
We will implement $$h = W_0 x + \Delta W x = W_0 x + BAx,$$ where $B \in \mathbb{R}^{d \times r}$, $A \in \mathbb{R}^{r \times k}$, and the rank $r \ll \min(d, k)$. Here $k$ is the input dimension (the size of the incoming tensor $x$) and $d$ is the output dimension.

In the paper, $A$ is initialized with random Gaussian values and $B$ with zeros, so $\Delta W = BA$ is zero at the start of training. $\Delta W x$ is scaled by $\frac{\alpha}{r}$, where $\alpha$ is a constant in $r$ (i.e. it does not change with $r$).

In [7]:
import torch
import torch.nn as nn


class LoRA(nn.Module):
    """Wrap a frozen module and add a trainable low-rank update to its output.

    The forward pass computes ``w_0(x) + (alpha / rank) * x @ A.T @ B.T``.
    ``A`` has shape ``(rank, input_dim)`` and is drawn from a standard normal
    distribution; ``B`` has shape ``(output_dim, rank)`` and starts at zero,
    so the wrapper initially reproduces ``w_0`` exactly.

    Args:
        w_0: Module whose parameters are frozen (``requires_grad = False``)
            and whose output the low-rank term is added to.
        rank: Inner dimension shared by ``A`` and ``B``.
        input_dim: Size of the last dimension of the input ``x``.
        output_dim: Size of the last dimension of the low-rank term; it must
            be addable to the output of ``w_0``.
        alpha: Numerator of the scaling factor ``alpha / rank``.
    """

    def __init__(self, w_0: nn.Module, rank: int, input_dim: int, output_dim: int, alpha: int):
        super().__init__()

        # Create A and B where the wrapped module already lives. Otherwise a
        # float64 (or accelerator-resident) ``w_0`` would make ``forward``
        # fail on a dtype/device mismatch between ``x`` and ``A``.
        factory_kwargs = {}
        reference = next(w_0.parameters(), None)
        if reference is not None:
            factory_kwargs["device"] = reference.device
            # torch.randn only supports floating dtypes; keep the default
            # dtype when the wrapped parameters are not floating point.
            if reference.is_floating_point():
                factory_kwargs["dtype"] = reference.dtype

        self.A = nn.Parameter(torch.randn(rank, input_dim, **factory_kwargs))
        self.B = nn.Parameter(torch.zeros(output_dim, rank, **factory_kwargs))

        # Freeze the wrapped module: only A and B remain trainable.
        self.w_0 = w_0
        for param in self.w_0.parameters():
            param.requires_grad = False

        self.scaling = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``w_0(x)`` plus the scaled low-rank term ``x @ A.T @ B.T``."""
        frozen_output = self.w_0(x)
        # (..., input_dim) @ (input_dim, rank) @ (rank, output_dim)
        low_rank_output = x @ self.A.T @ self.B.T

        return frozen_output + self.scaling * low_rank_output

In [8]:
# Smoke test: wrap a small linear layer with LoRA and check the output shape.
BATCH_SIZE=32
# d is the output feature size and k the input feature size of the layer below.
d, k = 10, 5
# Base layer mapping k -> d features; LoRA freezes its parameters when wrapping it.
original_layer = nn.Linear(k, d) 
# Random input batch of shape (BATCH_SIZE, k).
x = torch.randn(BATCH_SIZE, k)

# rank=2 and alpha=4 give a scaling factor of alpha / rank = 2.0.
lora_layer = LoRA(original_layer, rank=2, input_dim=k, output_dim=d, alpha=4)

output = lora_layer(x)

# Expected: input (BATCH_SIZE, k) and output (BATCH_SIZE, d).
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")

Input shape: torch.Size([32, 5])
Output shape: torch.Size([32, 10])
