In [1]:
# PyTorch example: Fine-tuning Llama2 with LoRA
# https://pytorch.org/torchtune/stable/tutorials/lora_finetune.html

## Minimal LoRA

In [2]:
import torch.nn as nn
from torch import Tensor

In [3]:
class LoRALinear(nn.Module):
    """Minimal LoRA layer: a frozen linear projection plus a trainable low-rank update.

    The forward pass computes
    ``linear(x) + (alpha / rank) * lora_b(lora_a(dropout(x)))``.

    Args:
        in_dim: Size of the input features.
        out_dim: Size of the output features.
        rank: Inner dimension of the low-rank factors; must be positive.
        alpha: Scaling numerator; the LoRA branch is multiplied by ``alpha / rank``.
        dropout: Dropout probability applied to the input of the LoRA branch only.

    Raises:
        ValueError: If ``rank`` is not a positive integer.
    """

    def __init__(self, in_dim: int, out_dim: int, rank: int, alpha: float, dropout: float):
        # nn.Module must be initialised before any submodule or parameter is
        # assigned, otherwise the first `self.linear = ...` raises AttributeError.
        super().__init__()

        # forward() divides by rank, so reject a non-positive rank up front.
        if rank <= 0:
            raise ValueError(f"rank must be a positive integer, got {rank}")

        # These are the weights from the original pretrained model
        self.linear = nn.Linear(in_dim, out_dim, bias=False)

        # These are the new LoRA params. In general rank << in_dim & out_dim
        self.lora_a = nn.Linear(in_dim, rank, bias=False)
        self.lora_b = nn.Linear(rank, out_dim, bias=False)

        # Rank and alpha are commonly-tuned hyperparameters
        self.rank = rank
        self.alpha = alpha

        # Most implementations also include some dropout
        self.dropout = nn.Dropout(p=dropout)

        # The original params are frozen, and only LoRA params are trainable
        self.linear.weight.requires_grad = False
        self.lora_a.weight.requires_grad = True
        self.lora_b.weight.requires_grad = True

    def forward(self, x: Tensor) -> Tensor:
        """Return the frozen projection of ``x`` plus the scaled low-rank update."""
        # This would be the output of the original model
        frozen_out = self.linear(x)

        # lora_a projects inputs down to the much smaller self.rank,
        # then lora_b projects back up to the output dimension
        lora_out = self.lora_b(self.lora_a(self.dropout(x)))

        # Finally, scale by the alpha parameter (normalized by rank)
        # and add to the original model's outputs
        return frozen_out + (self.alpha / self.rank) * lora_out

## Applying LoRA to Llama2 models

In [1]:
# Problem: memory usage was too high and the Jupyter kernel died.
# NOTE(review): KMP_DUPLICATE_LIB_OK presumably works around a duplicate OpenMP
# runtime error rather than reducing memory use - confirm it is still needed.
import os

os.environ.update(KMP_DUPLICATE_LIB_OK="TRUE")

In [2]:
from torchtune.models.llama2 import llama2_7b, lora_llama2_7b  # torchtune is a PyTorch library for easily authoring, fine-tuning and experimenting with LLMs.
# Reading list of torchtune tutorials. Because this bare string is the last
# expression in the cell, the notebook echoes it as the cell output.
'''
    torchtune, https://pytorch.org/torchtune/stable/index.html
    - Llama3 in torchtune
    - Finetuning with LoRA in torchtune
    - Understanding QLoRA in TorchTune
    - End-to-End Workflow with torchtune
'''

'\n    torchtune, https://pytorch.org/torchtune/stable/index.html\n    - Llama3 in torchtune\n    - Finetuning with LoRA in torchtune\n    - Understanding QLoRA in TorchTune\n    - End-to-End Workflow with torchtune\n'

### Base model (no LoRA)

In [3]:
# Build Llama2 without any LoRA layers.
# NOTE(review): an earlier cell records the kernel dying from memory pressure, and
# the LoRA variant later reports 6742609920 parameters in total; presumably this
# model is of similar size, so confirm there is enough memory before running it.
base_model = llama2_7b()

In [4]:
# Print the first layer's self-attention in the usual Llama2 model.
# All four projections (q_proj, k_proj, v_proj, output_proj) show up as plain Linear modules.
base_model.layers[0].attn

CausalSelfAttention(
  (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (output_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (pos_embeddings): RotaryPositionalEmbeddings()
)

### LoRA model

In [3]:
# The default settings for lora_llama2_7b will match those for llama2_7b.
# We just need to define which layers we want LoRA applied to.
# Within each self-attention, we can choose from ['q_proj', 'k_proj', 'v_proj', 'output_proj'].
# We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other
# linear layers outside of the self-attention.
# The rank is left at the builder's default here (the printed module shows out_features=8 on lora_a).
lora_model = lora_llama2_7b(lora_attn_modules=['q_proj', 'v_proj'])

In [4]:
# Print the same for Llama2 with LoRA weights.
# q_proj and v_proj (the modules passed in lora_attn_modules) appear as LoRALinear,
# while k_proj and output_proj remain plain Linear.
lora_model.layers[0].attn

CausalSelfAttention(
  (q_proj): LoRALinear(
    (dropout): Dropout(p=0.0, inplace=False)
    (lora_a): Linear(in_features=4096, out_features=8, bias=False)
    (lora_b): Linear(in_features=8, out_features=4096, bias=False)
  )
  (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (v_proj): LoRALinear(
    (dropout): Dropout(p=0.0, inplace=False)
    (lora_a): Linear(in_features=4096, out_features=8, bias=False)
    (lora_b): Linear(in_features=8, out_features=4096, bias=False)
  )
  (output_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (pos_embeddings): RotaryPositionalEmbeddings()
)

### Load the base_model weights into the LoRA model

In [None]:
# Assuming that base_model already has the pretrained Llama2 weights,
# this will directly load them into your LoRA model without any conversion necessary.
# strict=False is needed because the base model has no lora_a / lora_b weights, but it
# also silences every other key mismatch, so inspect the report that load_state_dict returns.
load_result = lora_model.load_state_dict(base_model.state_dict(), strict=False)

# Nothing in the base checkpoint should be left without a destination in the LoRA model.
assert not load_result.unexpected_keys, (
    f"Unexpected keys when loading base weights: {load_result.unexpected_keys}"
)

# The only parameters allowed to be missing are the new adapter weights (lora_a / lora_b).
non_lora_missing = [key for key in load_result.missing_keys if "lora" not in key]
assert not non_lora_missing, (
    f"Base weights missing for non-LoRA parameters: {non_lora_missing}"
)

In [7]:
from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params
# get_adapter_params: returns the subset of the model's parameters that belong to adapters
# set_trainable_params: sets an nn.Module's trainable parameters from a state dict of adapter parameters

In [12]:
# Fetch all params from the model that are associated with LoRA
lora_params = get_adapter_params(lora_model)

# Set requires_grad=True on lora_params, and requires_grad=False on all others
set_trainable_params(lora_model, lora_params)

# Count all parameters and the trainable subset (generator expressions avoid
# building throwaway lists of per-tensor counts).
total_params = sum(p.numel() for p in lora_model.parameters())
trainable_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)

# Report the totals; a stray double quote after the trainable count has been
# removed from the message so it no longer leaks into the printed output.
print(
    f"""
    {total_params} total params,
    {trainable_params} trainable params,
    {(100.0 * trainable_params / total_params):.2f}% of all params are trainable 
    """
)


    6742609920 total params,
    4194304" trainable params,
    0.06% of all params are trainable 
    


## LoRA finetuning recipe in torchtune

In [None]:
tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora

In [None]:
# Model Arguments
# NOTE(review): this is a YAML config excerpt, not Python, so the cell is not meant to be
# executed. Presumably it comes from the llama2/7B_lora config passed to `tune run` - confirm.
model:
  # Same builder name that this notebook imports from torchtune.models.llama2.
  _component_: lora_llama2_7b
  # Same attention projections the notebook passes to lora_llama2_7b.
  lora_attn_modules: ['q_proj', 'v_proj']
  # Matches out_features=8 on lora_a in the printed LoRA attention module.
  lora_rank: 8
  # With the alpha / rank scaling used by the minimal LoRALinear in this notebook this
  # would give a factor of 2 - assumes torchtune scales the same way; TODO confirm.
  lora_alpha: 16
...