In [1]:
import sys 
import os

import torch

In [2]:
# Resolve the repository root as the parent of the notebook's working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Verify the project root path
print("Project root path:", project_root)

# Make the project importable.
# The cells below import `dora_implementation.<module>`, which only resolves
# when the directory that CONTAINS the package (the project root) is on
# sys.path. The package directory itself is still added as before, for
# backward compatibility.
# Each entry is appended only when missing, so re-running this cell does not
# accumulate duplicate sys.path entries.
for import_dir in (project_root, os.path.join(project_root, 'dora_implementation')):
    if import_dir not in sys.path:
        sys.path.append(import_dir)


Project root path: /Users/vitorsousa/Documents/Projects/dora-implementation


## Try LoRA

In [3]:
from dora_implementation.model.lora_layer import LoRALayer, LinearWithLoRA, LinearWithLoRAMerged
from dora_implementation.model.mlp import MultiLayerPerceptron, freeze_linear_layers
from dora_implementation.train.train import train
from dora_implementation.eval.evaluation import compute_accuracy
from torch import nn
import torch

In [4]:
# Baseline: a single plain Linear layer applied to one random input row.
torch.manual_seed(123)  # fixed seed so the printed values are repeatable
layer = nn.Linear(in_features=10, out_features=2)
x = torch.randn(1, 10)
print("Original output:", layer(x))

Original output: tensor([[0.6639, 0.4487]], grad_fn=<AddmmBackward0>)


In [5]:
# Wrap the baseline layer in a LoRA adapter (rank 2, alpha 4) and feed it the
# same input that was used for the plain layer.
layer_lora_1 = LinearWithLoRA(layer, rank=2, alpha=4)
print("LoRA output:", layer_lora_1(x))

LoRA output: tensor([[0.6639, 0.4487]], grad_fn=<AddBackward0>)


Since we initialized the weight matrix $B$ with zeros in the LoRA layer, the matrix multiplication between $A$ and $B$ results in a matrix consisting of 0's and doesn't affect the original weights, which is why the output above is identical to the original layer's output.

In [6]:
# Same baseline layer, wrapped with the "merged" LoRA variant instead.
# The recorded output below matches the plain layer's output.
layer_lora_2 = LinearWithLoRAMerged(layer, rank=2, alpha=4)

print("LoRA output:", layer_lora_2(x))

LoRA output: tensor([[0.6639, 0.4487]], grad_fn=<AddmmBackward0>)


## Applying LoRA Layers
Because the implementation uses PyTorch modules, we can easily replace a `Linear` layer in an existing neural network with a `LinearWithLoRA` (or `LinearWithLoRAMerged`) layer.

We can implement the multilayer perceptron as follows:


In [7]:
# Architecture of the multilayer perceptron
num_features = 784  # 28 * 28, matching the 1x28x28 image batches loaded later
num_hidden_1, num_hidden_2 = 128, 256
num_classes = 10

# Optimisation settings
random_seed = 123
learning_rate = 5e-3
num_epochs = 2

In [8]:
# Build an untrained MLP from the architecture constants defined above.
model = MultiLayerPerceptron(num_features=num_features, num_hidden_1=num_hidden_1,
                             num_hidden_2=num_hidden_2, num_classes=num_classes)

In [9]:
# Show the module tree of the plain MLP (before any LoRA wrapping).
print(model)

MultiLayerPerceptron(
  (layers): Sequential(
    (0): Linear(in_features=784, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=10, bias=True)
  )
)


## MLP with LoRA layers

In [10]:
# Swap each Linear layer (positions 0, 2 and 4 of the Sequential) for a
# LoRA-wrapped version with rank 4 and alpha 8.
for layer_idx in (0, 2, 4):
    model.layers[layer_idx] = LinearWithLoRA(model.layers[layer_idx], rank=4, alpha=8)

In [11]:
# Show the module tree again: every Linear now sits inside a LinearWithLoRA.
print(model)

MultiLayerPerceptron(
  (layers): Sequential(
    (0): LinearWithLoRA(
      (linear): Linear(in_features=784, out_features=128, bias=True)
      (lora): LoRALayer()
    )
    (1): ReLU()
    (2): LinearWithLoRA(
      (linear): Linear(in_features=128, out_features=256, bias=True)
      (lora): LoRALayer()
    )
    (3): ReLU()
    (4): LinearWithLoRA(
      (linear): Linear(in_features=256, out_features=10, bias=True)
      (lora): LoRALayer()
    )
  )
)


With this model we can freeze the original `Linear` layers and only make the `LoRALayer` layers trainable as follows:

In [12]:
# Freeze the wrapped Linear layers, then list every parameter together with
# its requires_grad flag to verify which ones remain trainable.
freeze_linear_layers(model)
for name, param in model.named_parameters():
    print(f"{name}: {param.requires_grad}")

layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B: True


Based on the `True` and `False` values, we can confirm that only the `LoRA` layers are trainable.

# With a Dataset

In [13]:
import time
import numpy as np
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch


In [14]:
# When a CUDA device is available, turn on cuDNN's deterministic mode.
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True

In [15]:
# Show whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

False

In [16]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
BATCH_SIZE = 64

# --- MNIST ---------------------------------------------------------------
# transforms.ToTensor() scales input images to the 0-1 range.
# Only the training split requests a download; both splits share root 'data'.
train_dataset = datasets.MNIST(
    root='data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = datasets.MNIST(
    root='data',
    train=False,
    transform=transforms.ToTensor(),
)

# Training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Peek at the first training batch only, to confirm the tensor shapes.
for images, labels in train_loader:
    print('Image batch dimensions:', images.shape)
    print('Image label dimensions:', labels.shape)
    break

Image batch dimensions: torch.Size([64, 1, 28, 28])
Image label dimensions: torch.Size([64])


In [17]:
# Seed first so that the baseline's weight initialisation is repeatable.
torch.manual_seed(random_seed)

model_pretrained = MultiLayerPerceptron(
    num_features=num_features, num_hidden_1=num_hidden_1,
    num_hidden_2=num_hidden_2, num_classes=num_classes,
)
model_pretrained.to(DEVICE)

# Adam receives every parameter of the baseline model.
optimizer_pretrained = torch.optim.Adam(model_pretrained.parameters(), lr=learning_rate)

In [18]:

# Train the baseline MLP (no adapters) with the project's train helper.
# The recorded output below shows it logging batch losses and the training
# accuracy for each of the num_epochs epochs.
train(num_epochs, model_pretrained, optimizer_pretrained, train_loader, DEVICE)

Epoch: 001/002 | Batch 000/938 | Loss: 2.2971
Epoch: 001/002 | Batch 400/938 | Loss: 0.2084
Epoch: 001/002 | Batch 800/938 | Loss: 0.1584
Epoch: 001/002 training accuracy: 95.72%
Time elapsed: 0.06 min
Epoch: 002/002 | Batch 000/938 | Loss: 0.0469
Epoch: 002/002 | Batch 400/938 | Loss: 0.0589
Epoch: 002/002 | Batch 800/938 | Loss: 0.0553
Epoch: 002/002 training accuracy: 97.16%
Time elapsed: 0.12 min
Total Training Time: 0.12 min


In [19]:
# Report the baseline's accuracy on the MNIST test loader; this is the
# reference number for the LoRA and DoRA runs further down.
print(f'Test accuracy orig model: {compute_accuracy(model_pretrained, test_loader, DEVICE):.2f}%')

Test accuracy orig model: 96.82%


## Trained with LoRA

In [20]:
# Work on an independent deep copy so that model_pretrained itself is not
# modified when the copy's layers are replaced.
import copy
model_lora = copy.deepcopy(model_pretrained)

In [21]:
# Wrap each Linear layer of the copied model (Sequential positions 0, 2, 4)
# with a LoRA adapter of rank 4 and alpha 8.
for layer_idx in (0, 2, 4):
    model_lora.layers[layer_idx] = LinearWithLoRA(model_lora.layers[layer_idx], rank=4, alpha=8)

model_lora.to(DEVICE)

# The optimizer is handed all parameters of the wrapped model.
optimizer_lora = torch.optim.Adam(model_lora.parameters(), lr=learning_rate)

# Display the resulting module tree as the cell output.
model_lora

MultiLayerPerceptron(
  (layers): Sequential(
    (0): LinearWithLoRA(
      (linear): Linear(in_features=784, out_features=128, bias=True)
      (lora): LoRALayer()
    )
    (1): ReLU()
    (2): LinearWithLoRA(
      (linear): Linear(in_features=128, out_features=256, bias=True)
      (lora): LoRALayer()
    )
    (3): ReLU()
    (4): LinearWithLoRA(
      (linear): Linear(in_features=256, out_features=10, bias=True)
      (lora): LoRALayer()
    )
  )
)

In [22]:
# Freeze the pretrained Linear weights inside the LoRA wrappers.
freeze_linear_layers(model_lora)

# Check if linear layers are frozen
# (prints each parameter name with its requires_grad flag)
for name, param in model_lora.named_parameters():
    print(f"{name}: {param.requires_grad}")

layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B: True


In [23]:
# Fine-tune the LoRA-wrapped model with the same train helper, epoch count and
# training loader as the baseline, then report accuracy on the test loader.
train(num_epochs, model_lora, optimizer_lora, train_loader, DEVICE)
print(f'Test accuracy LoRA finetune: {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')

Epoch: 001/002 | Batch 000/938 | Loss: 0.1799
Epoch: 001/002 | Batch 400/938 | Loss: 0.1726
Epoch: 001/002 | Batch 800/938 | Loss: 0.0354
Epoch: 001/002 training accuracy: 97.48%
Time elapsed: 0.06 min
Epoch: 002/002 | Batch 000/938 | Loss: 0.2592
Epoch: 002/002 | Batch 400/938 | Loss: 0.1331
Epoch: 002/002 | Batch 800/938 | Loss: 0.0404
Epoch: 002/002 training accuracy: 97.44%
Time elapsed: 0.12 min
Total Training Time: 0.12 min
Test accuracy LoRA finetune: 96.70%


## Trained with DoRA

In [24]:
from dora_implementation.model.dora_layer import LinearWithDoRA, LinearWithDoRAMerged

In [25]:
# Start the DoRA experiment from a fresh deep copy of the pretrained baseline.
model_dora = copy.deepcopy(model_pretrained)

In [27]:
# Wrap each Linear layer of the copied model (Sequential positions 0, 2, 4)
# with the merged DoRA variant, using rank 4 and alpha 8.
for layer_idx in (0, 2, 4):
    model_dora.layers[layer_idx] = LinearWithDoRAMerged(model_dora.layers[layer_idx], rank=4, alpha=8)

model_dora.to(DEVICE)

# The optimizer is handed all parameters of the wrapped model.
optimizer_dora = torch.optim.Adam(model_dora.parameters(), lr=learning_rate)

# Display the resulting module tree as the cell output.
model_dora

MultiLayerPerceptron(
  (layers): Sequential(
    (0): LinearWithDoRAMerged(
      (linear): Linear(in_features=784, out_features=128, bias=True)
      (lora): LoRALayer()
    )
    (1): ReLU()
    (2): LinearWithDoRAMerged(
      (linear): Linear(in_features=128, out_features=256, bias=True)
      (lora): LoRALayer()
    )
    (3): ReLU()
    (4): LinearWithDoRAMerged(
      (linear): Linear(in_features=256, out_features=10, bias=True)
      (lora): LoRALayer()
    )
  )
)

In [28]:
# Sanity check before any fine-tuning: evaluate the freshly wrapped DoRA model.
# The recorded result below equals the baseline's test accuracy.
print(f'Test accuracy DoRA model: {compute_accuracy(model_dora, test_loader, DEVICE):.2f}%')

Test accuracy DoRA model: 96.82%


In [29]:
# Freeze the pretrained Linear weights inside the DoRA wrappers.
freeze_linear_layers(model_dora)

# List each parameter with its requires_grad flag. Per the recorded output,
# every wrapper also has a trainable parameter `m` next to lora.A / lora.B
# (presumably DoRA's magnitude term; confirm in dora_layer).
for name, param in model_dora.named_parameters():
    print(f"{name}: {param.requires_grad}")

layers.0.m: True
layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.2.m: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.4.m: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B: True


In [30]:
# Fine-tune the DoRA-wrapped model with the same train helper, epoch count and
# training loader as the baseline, then report accuracy on the test loader.
train(num_epochs, model_dora, optimizer_dora, train_loader, DEVICE)
print(f'Test accuracy DoRA finetune: {compute_accuracy(model_dora, test_loader, DEVICE):.2f}%')

Epoch: 001/002 | Batch 000/938 | Loss: 0.1191
Epoch: 001/002 | Batch 400/938 | Loss: 0.1147
Epoch: 001/002 | Batch 800/938 | Loss: 0.1228
Epoch: 001/002 training accuracy: 97.77%
Time elapsed: 0.08 min
Epoch: 002/002 | Batch 000/938 | Loss: 0.0264
Epoch: 002/002 | Batch 400/938 | Loss: 0.0729
Epoch: 002/002 | Batch 800/938 | Loss: 0.1104
Epoch: 002/002 training accuracy: 98.07%
Time elapsed: 0.17 min
Total Training Time: 0.17 min
Test accuracy DoRA finetune: 97.02%
