In [1]:
import sys
import logging
import bitsandbytes as bnb
from bitsandbytes.nn import Linear4bit
import bitsandbytes as bnb
import tqdm
import datasets
from datasets import load_dataset
from peft import LoraConfig,PeftConfig, PeftModel, PeftModelForCausalLM
import torch
import transformers
from trl import SFTTrainer
from typing import List, Dict, Any, Tuple
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
import safetensors
import torch.nn as nn
from functools import partial


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
safetensor_path = "/scratch/tathagato/redo_adapter_experiments/length/length/adapter_model.safetensors"
adapter_path = "/scratch/tathagato/redo_adapter_experiments/length/length/"

In [3]:
loaded_tensors = safetensors.torch.load_file(safetensor_path)

# Inspect the structure and shape of the tensors
for tensor_name, tensor_data in loaded_tensors.items():
    print(f"Tensor name: {tensor_name}")
    print(f"Shape: {tensor_data.shape}")
    print(f"Data type: {tensor_data.dtype}")
    print(f"Device: {tensor_data.device}")
    print(f"Tensor data (first few elements): {tensor_data.flatten()[:5]}")  # Print first few elements for inspection
    print()

Tensor name: base_model.model.model.layers.0.self_attn.k_proj.lora_A.weight
Shape: torch.Size([16, 2048])
Data type: torch.float32
Device: cpu
Tensor data (first few elements): tensor([-0.0347, -0.0062,  0.0624, -0.0074,  0.0427])

Tensor name: base_model.model.model.layers.0.self_attn.k_proj.lora_B.weight
Shape: torch.Size([256, 16])
Data type: torch.float32
Device: cpu
Tensor data (first few elements): tensor([ 5.5706e-04, -8.3941e-04, -1.4668e-05,  2.8237e-03, -1.2521e-03])

Tensor name: base_model.model.model.layers.0.self_attn.o_proj.lora_A.weight
Shape: torch.Size([16, 2048])
Data type: torch.float32
Device: cpu
Tensor data (first few elements): tensor([ 0.0120,  0.0225, -0.0087,  0.0140,  0.0249])

Tensor name: base_model.model.model.layers.0.self_attn.o_proj.lora_B.weight
Shape: torch.Size([2048, 16])
Data type: torch.float32
Device: cpu
Tensor data (first few elements): tensor([ 0.0048, -0.0169, -0.0033, -0.0135,  0.0060])

Tensor name: base_model.model.model.layers.0.self_att

In [3]:
base_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )
model_kwargs = dict(
        use_cache=False,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map=None,
        cache_dir = "/scratch/tathagato",
        attn_implementation = "eager",
        quantization_config = nf4_config, 

    )
quantized_model = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(base_model,cache_dir = "/scratch/tathagato")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [10]:
y = model.model.layers[0].self_attn.q_proj
#should 2048,2048
print(y.weight.device)
print(y.weight.shape)
#get a random matrix of shape (1,32,2048)
x = torch.randn(1,32,2048).to(y.weight.device)
print(x.device)
z = y(x)
print(z.shape)

cuda:0
torch.Size([2097152, 1])
cuda:0
torch.Size([1, 32, 2048])


In [11]:
non_quantized_model = AutoModelForCausalLM.from_pretrained(base_model, trust_remote_code=True, use_cache=False, cache_dir = "/scratch/tathagato")
print(non_quantized_model)
print(non_quantized_model.model.layers[0].self_attn.k_proj.weight.shape)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [12]:
for param in model.parameters():
    param.requires_grad = False

In [None]:
#load safetensors from a path 


In [79]:


# Step 1: Initialize W1_a and move it to the desired device (e.g., GPU)
W1_a = torch.nn.Parameter(torch.randn(64, 32) * 1)
#W1_a = bnb.nn.Params4bit(data = W1_a, "nf4")
W1_a = W1_a.to(0)  # Move to GPU (device 0)

# Step 2: Determine input and output features
input_features = W1_a.shape[1]
output_features = W1_a.shape[0]

# Step 3: Instantiate the Linear4bit layer without bias
linear4bit_layer = bnb.nn.Linear4bit(input_features, output_features, bias=False)

# Step 4: Assign the pre-existing weight to the Linear4bit layer's weight
# Note: This assignment is done correctly for initialization purposes,
# but quantization might affect the internal representation.
with torch.no_grad():
    linear4bit_layer.weight = bnb.nn.Params4bit(W1_a,requires_grad=True, quant_type="nf4", quant_storage= torch.float)
# Step 5: Move the Linear4bit layer to the same device as W1_a (if not already done)
# This is already done in Step 3

linear4bit_layer = linear4bit_layer.to(0)

print(linear4bit_layer)
print(linear4bit_layer.weight.shape, linear4bit_layer.weight.device, linear4bit_layer.weight.dtype)


# Additional check for another Linear4bit layer
x2 = bnb.nn.Linear4bit(64, 32, bias=False)
print(x2.weight.device, x2.weight.shape)
x2 = bnb.nn.Linear4bit(64, 32, bias=False).to(0)
print(x2.weight.device)
print(x2.weight.shape)


Linear4bit(in_features=32, out_features=64, bias=False)
torch.Size([256, 1]) cuda:0 torch.float32
cpu torch.Size([32, 64])
cuda:0
torch.Size([1024, 1])


In [12]:
import math 
print(math.log(262144,2), math.log(2048,2))
print(2**(math.log(262144,2) - math.log(2048,2)))


18.0 11.0
128.0


In [14]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear4bit(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm

In [27]:
adapter_model = PeftModelForCausalLM.from_pretrained(model, adapter_path, "test")

NameError: name 'model' is not defined

In [76]:
print(adapter_model.base_model.model.model.layers[0].self_attn.k_proj.base_layer.weight.shape,adapter_model.base_model.model.model.layers[0].self_attn.k_proj.lora_A.test.weight.shape,adapter_model.base_model.model.model.layers[0].self_attn.k_proj.lora_A.test.weight.dtype)

torch.Size([262144, 1]) torch.Size([16, 2048]) torch.float32


In [16]:
model.model.layers[0].self_attn.k_proj

lora.Linear4bit(
  (base_layer): Linear4bit(in_features=2048, out_features=256, bias=False)
  (lora_dropout): ModuleDict(
    (test): Dropout(p=0.05, inplace=False)
  )
  (lora_A): ModuleDict(
    (test): Linear(in_features=2048, out_features=16, bias=False)
  )
  (lora_B): ModuleDict(
    (test): Linear(in_features=16, out_features=256, bias=False)
  )
  (lora_embedding_A): ParameterDict()
  (lora_embedding_B): ParameterDict()
)

In [77]:
adapter_model.base_model.model.model.layers[0].self_attn.k_proj

lora.Linear4bit(
  (base_layer): Linear4bit(in_features=2048, out_features=256, bias=False)
  (lora_dropout): ModuleDict(
    (test): Dropout(p=0.05, inplace=False)
  )
  (lora_A): ModuleDict(
    (test): Linear(in_features=2048, out_features=16, bias=False)
  )
  (lora_B): ModuleDict(
    (test): Linear(in_features=16, out_features=256, bias=False)
  )
  (lora_embedding_A): ParameterDict()
  (lora_embedding_B): ParameterDict()
)

In [70]:
adapter_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (test): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (test): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (test): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_featur

In [6]:
a = nn.Linear(5,10)
print(a.weight.shape)
b = nn.Linear(10,11)
print(b.weight.shape)
c = torch.randn(3,5)
print(c.shape)
r = b(a(c))
print(r.shape)

torch.Size([10, 5])
torch.Size([11, 10])
torch.Size([3, 5])
torch.Size([3, 11])


In [4]:
class CascadedLoRALayer(torch.nn.Module):
    def __init__(self, in_dim, out_dim, rank_1, rank_2, alpha_1, alpha_2, dropout = None, adapter_name = "test"):
        super().__init__()
        std_dev_1 = 1 / torch.sqrt(torch.tensor(rank_1).float())
        std_dev_2 = 1 / torch.sqrt(torch.tensor(rank_2).float())
        if dropout is not None:
            self.dropout = nn.ModuleDict(torch.nn.Dropout(dropout), inplace = False)
        self.W1 = nn.ModuleDict(
            {
                "A" : torch.nn.Linear(in_dim, rank_1, bias = False),
                "B" : torch.nn.Linear(rank_1, out_dim, bias = False)
            }
        )
        self.W1['A'].weight = torch.nn.Parameter(torch.randn(rank_1, in_dim) * std_dev_1)
        self.W1['B'].weight = torch.nn.Parameter(torch.zeros(out_dim, rank_1))
        self.W2 = nn.ModuleDict(
            {
                "A1" : torch.nn.Linear(in_dim, rank_2, bias = False),
                "A2" : torch.nn.Linear(rank_2, rank_1, bias = False),
                "B1" : torch.nn.Linear(rank_1, rank_2, bias = False),
                "B2" : torch.nn.Linear(rank_2, out_dim, bias = False)
            }
        )
        self.W2['A1'].weight = torch.nn.Parameter(torch.randn(rank_2, in_dim) * std_dev_2)
        self.W2['A2'].weight = torch.nn.Parameter(torch.zeros(rank_1, rank_2))
        self.W2['B1'].weight = torch.nn.Parameter(torch.zeros(rank_2, rank_1))
        self.W2['B2'].weight = torch.nn.Parameter(torch.zeros(out_dim, rank_2) * std_dev_2)
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.is_second_layar_being_trained = False
        self.is_first_layer_being_trained = False
        self.is_first_layer_being_used_for_inference = True
        self.is_first_layer_being_used_for_inference = True
        self.scaling_1 = self.alpha_1 / self.rank_1
        self.scaling_2 = self.alpha_2 / self.rank_2



    def set_gradients_for_all_layer(self):
        if self.is_second_layar_being_trained:
            self.W2_a1.requires_grad = True
            self.W2_a2.requires_grad = True
            self.W2_b1.requires_grad = True
            self.W2_b2.requires_grad = True
        else:
            self.W2_a1.requires_grad = False
            self.W2_a2.requires_grad = False
            self.W2_b1.requires_grad = False
            self.W2_b2.requires_grad = False
            
        if self.is_first_layer_being_trained:
            self.W1_a.requires_grad = True
            self.W1_b.requires_grad = True
        else:
            self.W1_a.requires_grad = False
            self.W1_b.requires_grad = False
    
    def tune_the_first_adapter(self):
        self.is_first_layer_being_trained = True
    
    def freeze_the_first_adapter(self):
        self.is_first_layer_being_trained = False
    
    def tune_the_second_adapter(self):
        self.is_second_layar_being_trained = True
    
    def freeze_the_second_adapter(self):
        self.is_second_layar_being_trained = False
    


    def forward(self, x):
        self.set_gradients_for_all_layer()
        if self.is_first_layer_being_used_for_inference and self.is_second_layer_being_used_for_inference:
            #x = self.scaling_1 * (x @ self.W1_a @ self.W1_b) + self.scaling_2 * (x @ self.W2_a1 @ self.W2_a2 @ self.W2_b1 @ self.W2_b2)
            x = self.scaling_1 * (self.W1['A'](self.W1['B'](x))) + self.scaling_2 * (self.W2['B2'](self.W2['A2'](self.W2['B1'](self.W2['A1'](x)))))
        if self.is_first_layer_being_used_for_inference and not self.is_second_layer_being_used_for_inference:
            #x = self.scaling_2 * (x @ self.W2_a1 @ self.W2_a2) 
            x = self.scaling_1 * (self.W1['A'](self.W1['B'](x))) 
        return x
"""
(base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (test): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (test): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (test): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
"""


class CascadedLoRALinear4bit(torch.nn.Module):
    def __init__(self, linear, in_dim, out_dim, rank_1 = 64, rank_2 = 32, alpha_1 = 16, alpha_2 = 16, adapter_name = "default" , dropout = None):
        super().__init__()
        self.base_layer = linear
        std_dev_1 = 1 / torch.sqrt(torch.tensor(rank_1).float())
        std_dev_2 = 1 / torch.sqrt(torch.tensor(rank_2).float())
        if dropout is not None:
            self.lora_dropout = nn.ModuleDict(
                {
                    adapter_name : torch.nn.Dropout(dropout)
                }
            )
        #first dimension
        self.lora_A = nn.ModuleDict(
            {
                adapter_name : torch.nn.Linear(in_dim, rank_1, bias = False)
            }
        )
        self.lora_B = nn.ModuleDict(
            {
                adapter_name : torch.nn.Linear(rank_1, out_dim, bias = False)
            }
        )
        self.lora_A[adapter_name].weight = torch.nn.Parameter(torch.randn(rank_1, in_dim) * std_dev_1)
        self.lora_B[adapter_name].weight = torch.nn.Parameter(torch.zeros(out_dim, rank_1))  

        self.lora_A1 = nn.ModuleDict(
            {
                adapter_name : torch.nn.Linear(in_dim, rank_2, bias = False)
            }
        )
        self.lora_A2 = nn.ModuleDict(
            {
                adapter_name : torch.nn.Linear(rank_2, rank_1, bias = False)
            }
        )
        self.lora_B1 = nn.ModuleDict(
            {
                adapter_name : torch.nn.Linear(rank_1, rank_2, bias = False)
            }
        )
        self.lora_B2 = nn.ModuleDict(
            {
                adapter_name : torch.nn.Linear(rank_2, out_dim, bias = False)
            }
        )
        self.lora_A1[adapter_name].weight = torch.nn.Parameter(torch.randn(rank_2, in_dim) * std_dev_2)
        self.lora_A2[adapter_name].weight = torch.nn.Parameter(torch.zeros(rank_1, rank_2))
        self.lora_B1[adapter_name].weight = torch.nn.Parameter(torch.zeros(rank_2, rank_1))
        self.lora_B2[adapter_name].weight = torch.nn.Parameter(torch.zeros(out_dim, rank_2) * std_dev_2)  
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.rank_1 = rank_1
        self.rank_2 = rank_2
        self.is_second_layar_being_trained = False
        self.is_first_layer_being_trained = False
        self.is_first_layer_being_used_for_inference = True
        self.is_first_layer_being_used_for_inference = True
        self.scaling_1 = self.rank_1 / self.alpha_1
        self.scaling_2 = self.rank_2 / self.alpha_1
        self.adapter_name = adapter_name



    def set_gradients_for_all_layer(self):
        if self.is_second_layar_being_trained:
            self.lora_A1[self.adapter_name].requires_grad = True
            self.lora_A2[self.adapter_name].requires_grad = True
            self.lora_B1[self.adapter_name].requires_grad = True
            self.lora_B2[self.adapter_name].requires_grad = True


        else:
            self.lora_A1[self.adapter_name].requires_grad = False
            self.lora_A2[self.adapter_name].requires_grad = False
            self.lora_B1[self.adapter_name].requires_grad = False
            self.lora_B2[self.adapter_name].requires_grad = False
            
        if self.is_first_layer_being_trained:
            self.lora_A[self.adapter_name].requires_grad = True
            self.lora_B[self.adapter_name].requires_grad = True
        else:
            self.lora_A[self.adapter_name].requires_grad = False
            self.lora_B[self.adapter_name].requires_grad = False  
    
    def tune_the_first_adapter(self):
        self.is_first_layer_being_trained = True
    
    def freeze_the_first_adapter(self):
        self.is_first_layer_being_trained = False
    
    def tune_the_second_adapter(self):
        self.is_second_layar_being_trained = True
    
    def freeze_the_second_adapter(self):
        self.is_second_layar_being_trained = False

    def forward(self, x):

        self.set_gradients_for_all_layer()
        if self.is_first_layer_being_used_for_inference and self.is_second_layer_being_used_for_inference:
            #x = self.scaling_1 * (x @ self.W1_a @ self.W1_b) + self.scaling_2 * (x @ self.W2_a1 @ self.W2_a2 @ self.W2_b1 @ self.W2_b2)
            output  = self.linear(x) + self.scaling_1 * (self.W1['A'](self.W1['B'](x))) + self.scaling_2 * (self.W2['B2'](self.W2['A2'](self.W2['B1'](self.W2['A1'](x)))))
        if self.is_first_layer_being_used_for_inference and not self.is_second_layer_being_used_for_inference:
            #x = self.scaling_2 * (x @ self.W2_a1 @ self.W2_a2) 
            output  =  self.linear(x)  + self.scaling_1 * (self.W1['A'](self.W1['B'](x))) 
        return output
    







In [5]:

rank_1 = 64
rank_2 = 32
alpha_1 = 16
alpha_2 = 16
adapter_name = "test"
dropout = 0.05
target_modules = [
                    'q_proj',
                    'k_proj',
                    'v_proj',
                    'o_proj',
                    'gate_proj',
                    'up_proj',
                    'down_proj'
]


In [43]:
def replace_with_cascaded_lora(module, target_modules = target_modules, rank_1 = 64, rank_2 = 32, alpha_1 = 16 , alpha_2 = 16 , adapter_name = "default" , dropout = None):
    for name, child in module.named_children():
        if isinstance(child, bnb.nn.Linear4bit) and name in target_modules:
            #setattr(module, name, CascadedLoRALinear4bit(child, in_dim, out_dim, **kwargs))
            #print(name)
            #print(child.in_features, child.out_features)
            setattr(module, name, CascadedLoRALinear4bit(child, child.in_features, child.out_features, rank_1, rank_2, alpha_1, alpha_2, adapter_name , dropout = dropout))
        else:
            replace_with_cascaded_lora(child, target_modules, rank_1, rank_2, alpha_1, alpha_2, adapter_name , dropout = None)
def print_device_and_dtype(model, file = sys.stdout):

    for name, module in model.named_modules():
        # Get the device and dtype of the module's parameters
        #file = open(file, "a")
        try:
            param = next(module.parameters())
            device = param.device
            dtype = param.dtype
            type = param.type()
        except StopIteration:
            device = 'No parameters'
            dtype = 'No parameters'
            type = 'No parameters'

        
        # Print the name, device, and dtype of the module
        print(f"Module: {name}", file = file)
        print(f"  Device: {device}", file = file)
        print(f"  Dtype: {dtype}", file = file)
        print(f"  Type: {type}", file = file)
        print(" ",file = file )



replace_with_cascaded_lora(quantized_model)
#print(quantized_model)
print_device_and_dtype(quantized_model, file = "cascaded_lora_structure.txt")



AttributeError: 'str' object has no attribute 'write'

In [39]:
base_model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
base_model_quantized = AutoModelForCausalLM.from_pretrained(base_model_path, **model_kwargs)
adapter_model = PeftModelForCausalLM.from_pretrained(base_model_quantized, adapter_path, "test")
#print(adapter_model)
print_device_and_dtype(adapter_model, file = "./peft_structure.txt")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


TypeError: print_device_and_dtype() got an unexpected keyword argument 'file'

In [None]:
#https://github.com/microsoft/LoRA/blob/main/loralib/layers.py#L12
class LoRALayer():
    def __init__(
        self, 
        rank_1 : int,
        rank_2 : int, 
        lora_alpha_1: int,
        lora_alpha_2: int, 
        lora_dropout: float,
        merge_weights: bool,
    ):
        self.rank_1 = rank_1
        self.rank_2 = rank_2
        self.lora_alpha_1 = lora_alpha_1
        self.lora_alpha_2 = lora_alpha_2
        # Optional dropout
        if lora_dropout > 0.:
            self.lora_dropout = nn.Dropout(p=lora_dropout)
        else:
            self.lora_dropout = lambda x: x
        # Mark the weight as unmerged
        self.merged = False
        self.merge_weights = merge_weights