<a href="https://colab.research.google.com/github/ekkiprop/llms/blob/main/03_Godoy_FT_Low_Rank_Adaptation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bitsandbytes
import numpy as np
import torch
import torch.nn as nn
from copy import deepcopy
from numpy.linalg import matrix_rank
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, AutoConfig
from transformers.integrations.bitsandbytes import get_keys_to_not_convert
from bitsandbytes.nn import Linear8bitLt, Linear4bit, LinearFP4,  LinearNF4


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch~=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch~=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch~=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch~=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch~=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.

In [2]:
# Base layer that the low-rank factors below are added to: a square, bias-free projection.
base_layer = nn.Linear(in_features=1024, out_features=1024, bias=False)
# Show the weight's shape together with its total number of entries.
(base_layer.weight.shape, base_layer.weight.numel())

(torch.Size([1024, 1024]), 1048576)

In [3]:
# Seed first so the two random factor initialisations are reproducible.
torch.manual_seed(11)
r = 8  # inner dimension shared by the two factors
# A maps in_features -> r, B maps r -> out_features; neither has a bias.
layer_A = nn.Linear(in_features=base_layer.in_features, out_features=r, bias=False)
layer_B = nn.Linear(in_features=r, out_features=base_layer.out_features, bias=False)
(layer_A, layer_B)

(Linear(in_features=1024, out_features=8, bias=False),
 Linear(in_features=8, out_features=1024, bias=False))

In [4]:

# Entry counts of the two factor weights: in_features * r and r * out_features.
layer_A.weight.numel(), layer_B.weight.numel()

(8192, 8192)

In [5]:
# Product of the two factor weights, B (out x r) times A (r x in): an (out x in) matrix.
composite = torch.matmul(layer_B.weight, layer_A.weight)

In [6]:
# Shape and entry count of the product, for comparison with base_layer.weight.
composite.shape, composite.numel()

(torch.Size([1024, 1024]), 1048576)

In [7]:
# Rank of the product computed with NumPy, on a detached NumPy copy of the tensor.
matrix_rank(composite.detach().numpy())

8

In [8]:
# Same rank check, this time with torch.linalg; the result is a 0-dim tensor.
ther=torch.linalg.matrix_rank(composite.detach())
ther

tensor(8)

In [9]:
# NumPy's matrix_rank called directly on the detached tensor (no explicit .numpy()).
matrix_rank(composite.detach())

8

In [10]:
# Fixed seed so the random input row is reproducible.
torch.manual_seed(19)
batch = torch.randn(1, 1024)
# Forward pass through the summed weight (base + B @ A), transposed for right-multiplication.
torch.matmul(batch, (base_layer.weight.data + torch.matmul(layer_B.weight, layer_A.weight)).T)

tensor([[ 0.2958, -0.1800, -0.3731,  ..., -0.1412,  0.7358,  0.4212]],
       grad_fn=<MmBackward0>)

In [11]:
# Shape of the base layer's weight tensor.
base_layer.weight.data.shape

torch.Size([1024, 1024])

In [13]:
# Output of the base weights alone for the same input row.
regular_output = torch.matmul(batch, base_layer.weight.data.T)
# Contribution of the low-rank product B @ A for that input.
additional_outputA = torch.matmul(batch, torch.matmul(layer_B.weight, layer_A.weight).T)
(regular_output.shape, additional_outputA.shape)

(torch.Size([1, 1024]), torch.Size([1, 1024]))

In [14]:
# Display the low-rank contribution computed from the full B @ A product.
additional_outputA

tensor([[-0.0357,  0.0230, -0.4607,  ..., -0.2920,  0.1944,  0.5041]],
       grad_fn=<MmBackward0>)

In [15]:
# Apply the factors one after the other instead of forming the full B @ A matrix:
# first multiply by A's weight (down to r columns), then by B's weight (back to out_features).
out_A = torch.matmul(batch, layer_A.weight.T)
add_out = torch.matmul(out_A, layer_B.weight.T)
add_out

tensor([[-0.0357,  0.0230, -0.4607,  ..., -0.2920,  0.1944,  0.5041]],
       grad_fn=<MmBackward0>)

In [16]:
# Display the base-layer output on its own.
regular_output

tensor([[ 0.3315, -0.2030,  0.0876,  ...,  0.1508,  0.5414, -0.0829]])

In [17]:
# Total output: base-layer output plus the low-rank contribution.
output = torch.add(regular_output, add_out)

In [18]:
# Display the combined output (base + low-rank contribution).
output

tensor([[ 0.2958, -0.1800, -0.3731,  ..., -0.1412,  0.7358,  0.4212]],
       grad_fn=<AddBackward0>)

In [78]:
# Scale the low-rank contribution by alpha / r; with alpha = 2 * r the factor is 2.
alpha = r * 2
output = regular_output + additional_outputA * (alpha / r)
output

tensor([[ 0.2600, -0.1569, -0.8338,  ..., -0.4332,  0.9301,  0.9253]],
       grad_fn=<AddBackward0>)

In [79]:
# Use bfloat16 for compute only when bf16 is supported without emulation; otherwise float32.
supported = torch.cuda.is_bf16_supported(including_emulation=False)
if supported:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float32

# 4-bit NF4 quantization with double quantization enabled.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load OPT-350m on the first CUDA device using the quantization config above.
model_q4 = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    quantization_config=nf4_config,
    torch_dtype=compute_dtype,
    device_map='cuda:0',
)

In [80]:
def trainable_parms(model):
    """Return ``(name, dtype)`` pairs for every parameter of ``model`` that requires grad.

    Parameters are visited in the order produced by ``model.named_parameters()``.
    """
    trainable = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            trainable.append((name, param.dtype))
    return trainable

# Trainable parameters of the inner model right after loading, before any preparation step.
trainable_parms(model_q4.model)

[('decoder.embed_tokens.weight', torch.float32),
 ('decoder.embed_positions.weight', torch.float32),
 ('decoder.layers.0.self_attn_layer_norm.weight', torch.float32),
 ('decoder.layers.0.self_attn_layer_norm.bias', torch.float32),
 ('decoder.layers.0.final_layer_norm.weight', torch.float32),
 ('decoder.layers.0.final_layer_norm.bias', torch.float32),
 ('decoder.layers.1.self_attn_layer_norm.weight', torch.float32),
 ('decoder.layers.1.self_attn_layer_norm.bias', torch.float32),
 ('decoder.layers.1.final_layer_norm.weight', torch.float32),
 ('decoder.layers.1.final_layer_norm.bias', torch.float32),
 ('decoder.layers.2.self_attn_layer_norm.weight', torch.float32),
 ('decoder.layers.2.self_attn_layer_norm.bias', torch.float32),
 ('decoder.layers.2.final_layer_norm.weight', torch.float32),
 ('decoder.layers.2.final_layer_norm.bias', torch.float32),
 ('decoder.layers.3.self_attn_layer_norm.weight', torch.float32),
 ('decoder.layers.3.self_attn_layer_norm.bias', torch.float32),
 ('decoder.la

In [81]:
# Prepare the quantized model for k-bit training, with non-reentrant gradient checkpointing.
prepared_model = prepare_model_for_kbit_training(
    model_q4,
    use_gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
)
prepared_model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear4bit(in_features=1024, out_features=512, bias=False)
      (project_in): Linear4bit(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear4bit(in_features=1024, out_features=4096, bias=True)
       

In [82]:
# Re-check through the original model_q4 handle: the list is now empty, so the
# preparation step also changed model_q4 (none of its parameters require grad any more).
trainable_parms(model_q4.model)

[]

In [91]:
# (Stray scratch cell removed: it evaluated the undefined name `parm`, which raised
# NameError and would stop a Restart & Run All at this point.)

In [93]:
def parms_of_dtype(model, dtype=torch.float32):
    """Return the names of the parameters of ``model`` whose dtype equals ``dtype``.

    Names are listed in the order produced by ``model.named_parameters()``.
    """
    matching = []
    for name, param in model.named_parameters():
        if param.dtype == dtype:
            matching.append(name)
    return matching

# Names of the float32 parameters of the prepared model (default dtype argument).
parms_of_dtype(prepared_model)


['model.decoder.embed_tokens.weight',
 'model.decoder.embed_positions.weight',
 'model.decoder.layers.0.self_attn.k_proj.bias',
 'model.decoder.layers.0.self_attn.v_proj.bias',
 'model.decoder.layers.0.self_attn.q_proj.bias',
 'model.decoder.layers.0.self_attn.out_proj.bias',
 'model.decoder.layers.0.self_attn_layer_norm.weight',
 'model.decoder.layers.0.self_attn_layer_norm.bias',
 'model.decoder.layers.0.fc1.bias',
 'model.decoder.layers.0.fc2.bias',
 'model.decoder.layers.0.final_layer_norm.weight',
 'model.decoder.layers.0.final_layer_norm.bias',
 'model.decoder.layers.1.self_attn.k_proj.bias',
 'model.decoder.layers.1.self_attn.v_proj.bias',
 'model.decoder.layers.1.self_attn.q_proj.bias',
 'model.decoder.layers.1.self_attn.out_proj.bias',
 'model.decoder.layers.1.self_attn_layer_norm.weight',
 'model.decoder.layers.1.self_attn_layer_norm.bias',
 'model.decoder.layers.1.fc1.bias',
 'model.decoder.layers.1.fc2.bias',
 'model.decoder.layers.1.final_layer_norm.weight',
 'model.decode

In [87]:
# Repeat of the same float32 query on prepared_model as in the preceding cell.
parms_of_dtype(prepared_model)

['model.decoder.embed_tokens.weight',
 'model.decoder.embed_positions.weight',
 'model.decoder.layers.0.self_attn.k_proj.bias',
 'model.decoder.layers.0.self_attn.v_proj.bias',
 'model.decoder.layers.0.self_attn.q_proj.bias',
 'model.decoder.layers.0.self_attn.out_proj.bias',
 'model.decoder.layers.0.self_attn_layer_norm.weight',
 'model.decoder.layers.0.self_attn_layer_norm.bias',
 'model.decoder.layers.0.fc1.bias',
 'model.decoder.layers.0.fc2.bias',
 'model.decoder.layers.0.final_layer_norm.weight',
 'model.decoder.layers.0.final_layer_norm.bias',
 'model.decoder.layers.1.self_attn.k_proj.bias',
 'model.decoder.layers.1.self_attn.v_proj.bias',
 'model.decoder.layers.1.self_attn.q_proj.bias',
 'model.decoder.layers.1.self_attn.out_proj.bias',
 'model.decoder.layers.1.self_attn_layer_norm.weight',
 'model.decoder.layers.1.self_attn_layer_norm.bias',
 'model.decoder.layers.1.fc1.bias',
 'model.decoder.layers.1.fc2.bias',
 'model.decoder.layers.1.final_layer_norm.weight',
 'model.decode

In [84]:

# Print every parameter name with its stored shape.
# NOTE(review): the 4-bit weights appear flattened, e.g. [524288, 1] for a 1024x1024 layer,
# which is consistent with two 4-bit values packed per stored element — confirm.
for name, param in model_q4.named_parameters():
  print(f"Name: {name}, Shape: {param.shape}")

Name: model.decoder.embed_tokens.weight, Shape: torch.Size([50272, 512])
Name: model.decoder.embed_positions.weight, Shape: torch.Size([2050, 1024])
Name: model.decoder.project_out.weight, Shape: torch.Size([262144, 1])
Name: model.decoder.project_in.weight, Shape: torch.Size([262144, 1])
Name: model.decoder.layers.0.self_attn.k_proj.weight, Shape: torch.Size([524288, 1])
Name: model.decoder.layers.0.self_attn.k_proj.bias, Shape: torch.Size([1024])
Name: model.decoder.layers.0.self_attn.v_proj.weight, Shape: torch.Size([524288, 1])
Name: model.decoder.layers.0.self_attn.v_proj.bias, Shape: torch.Size([1024])
Name: model.decoder.layers.0.self_attn.q_proj.weight, Shape: torch.Size([524288, 1])
Name: model.decoder.layers.0.self_attn.q_proj.bias, Shape: torch.Size([1024])
Name: model.decoder.layers.0.self_attn.out_proj.weight, Shape: torch.Size([524288, 1])
Name: model.decoder.layers.0.self_attn.out_proj.bias, Shape: torch.Size([1024])
Name: model.decoder.layers.0.self_attn_layer_norm.weig

In [85]:
# Memory footprint reported by the model, divided by 1e6 (megabytes, assuming the
# method returns bytes — confirm against the transformers documentation).
prepared_model.get_memory_footprint()/1e6

264.15104