<a href="https://colab.research.google.com/github/tomonari-masada/course2025-nlp/blob/main/10_finetuning_LLMs_with_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LoRAを使ったLLMのfine-tuning

## 準備

In [None]:
import time
from tqdm.auto import tqdm
import torch
from torch.utils.data import DataLoader
from datasets import load_from_disk
from transformers import set_seed, AutoTokenizer, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType, PeftModel

# Seed the random number generators through transformers' set_seed so that
# repeated runs of the notebook are comparable.
set_seed(0)

# Use the GPU when PyTorch reports CUDA as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## データセット

* livedoorニュースコーパスを使う。

In [None]:
# Download the archived livedoor news dataset from the course repository.
!wget https://github.com/tomonari-masada/course2025-nlp/raw/refs/heads/main/livedoor_ds.tar.gz
# Unpack the archive (presumably yields the "livedoor_ds" directory — verify after extraction).
!tar zxf livedoor_ds.tar.gz

In [None]:
# Load the dataset stored on disk under "livedoor_ds".
ds = load_from_disk("livedoor_ds")
# Bare expression so the notebook displays the loaded object.
ds

In [None]:
# Category names of the livedoor news corpus.
# NOTE(review): presumably index-aligned with the integer "category" labels — confirm.
category_names = [
    "movie-enter",
    "it-life-hack",
    "kaden-channel",
    "topic-news",
    "livedoor-homme",
    "peachy",
    "sports-watch",
    "dokujo-tsushin",
    "smax",
]

# The number of classes is the number of distinct "category" values seen in
# the training split.
num_labels = len(
    set(ds["train"]["category"])
)
print(f"Number of labels: {num_labels}")

## LLM

* ここでは`intfloat/multilingual-e5-large-instruct`を使う。
  * 他のLLMでも、コードは同様に書けばよい。

## トークナイザの取得

In [None]:
# Hugging Face Hub id of the checkpoint used for both the tokenizer and the model.
model_name = "intfloat/multilingual-e5-large-instruct"
# Tokenizer matching that checkpoint; used later by collate_fn.
tokenizer = AutoTokenizer.from_pretrained(model_name)

* collate関数
  * あとでDataLoaderに使う。

In [None]:
def collate_fn(batch):
    """Convert a list of dataset samples into a model-ready batch.

    Every sample is read through its ``"content"`` (text) and ``"category"``
    (label) keys. The texts are tokenized together with padding and with
    truncation to ``tokenizer.model_max_length``.

    Args:
        batch: Sequence of samples, each indexable by ``"content"`` and
            ``"category"``.

    Returns:
        A pair ``(encoded, targets)``: the tokenizer output and a tensor of
        labels, both moved to the module-level ``device``.
    """
    texts = [sample["content"] for sample in batch]
    labels = [sample["category"] for sample in batch]
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
        return_tensors="pt",
    )
    targets = torch.tensor(labels)
    return encoded.to(device), targets.to(device)

## モデルの取得

In [None]:
# Load the pretrained checkpoint with a sequence-classification head sized to
# num_labels, then move the whole model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

## LoRA

### LLMの構造の確認
* どの部分をLoRAアダプタで更新するかを決める。

In [None]:
# Display the module tree so the submodule names that LoRA can target are visible.
model

### LoRAの設定

* https://huggingface.co/docs/peft/en/package_reference/lora

In [None]:
# LoRA hyperparameters for a sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    # Names of the submodules that receive a LoRA adapter.
    target_modules=["word_embeddings", "query", "value", "key", "dense"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,  # False because the adapter is going to be trained
)

# Wrap the base model with the adapters, then report how many parameters are
# trainable.
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

In [None]:
# Display the wrapped model to inspect where the LoRA layers were inserted.
lora_model

* `requires_grad`がTrueになっているパラメータを調べてみる。

In [None]:
# Print the name and shape of every parameter that will receive gradient
# updates; frozen parameters are skipped.
for name, param in lora_model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data.shape)

## 評価のためのヘルパ関数

In [None]:
def evaluation(model, dataloader, criterion):
    """Compute the average loss and accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode for the duration of the pass and is put
    back into whichever mode (train or eval) it was in on entry. Without that
    restore, calling this function in the middle of a training loop would
    leave dropout disabled for the remainder of training.

    Args:
        model: Module called as ``model(**tokenized)`` whose output exposes a
            ``logits`` attribute.
        dataloader: Iterable yielding ``(tokenized, labels)`` pairs.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar
            loss tensor. The sample-weighted average below assumes it returns
            the mean over the batch — confirm for custom criteria.

    Returns:
        A tuple ``(mean_loss, accuracy)`` averaged over all evaluated samples.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    was_training = model.training
    model.eval()
    total_acc, total_count = 0, 0
    total_loss = 0.0
    try:
        with torch.no_grad():
            for tokenized, labels in tqdm(dataloader):
                num_samples = labels.size(0)
                logits = model(**tokenized).logits
                loss = criterion(logits, labels)
                # Weight the batch loss by batch size so a smaller final batch
                # does not skew the overall average.
                total_loss += loss.item() * num_samples
                total_acc += (logits.argmax(1) == labels).sum().item()
                total_count += num_samples
    finally:
        # Restore the caller's mode even if the forward pass raised.
        model.train(was_training)

    if total_count == 0:
        raise ValueError("evaluation() received a dataloader that yields no samples")
    return total_loss / total_count, total_acc / total_count

## trainingのためのヘルパ関数

In [None]:
def train(model, dataloader, eval_dataloader, optimizer, criterion, gradient_accumulation_steps=1, eval_interval=100, log_interval=50):
    """Run one pass over ``dataloader``, updating ``model`` with gradient accumulation.

    Args:
        model: Module called as ``model(**tokenized)`` whose output exposes a
            ``logits`` attribute.
        dataloader: Iterable yielding ``(tokenized, labels)`` training batches;
            must support ``len()`` (used in the progress log).
        eval_dataloader: Batches passed to ``evaluation`` for periodic
            validation.
        optimizer: Optimizer stepped once every
            ``gradient_accumulation_steps`` batches.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar
            loss tensor.
        gradient_accumulation_steps: Number of batches whose gradients are
            summed before each optimizer step. Must be at least 1.
        eval_interval: Run validation every this many batches; ``0`` or less
            disables it.
        log_interval: Print running training accuracy/loss every this many
            batches; ``0`` or less disables it.

    Raises:
        ValueError: If ``gradient_accumulation_steps`` is smaller than 1.
    """
    if gradient_accumulation_steps < 1:
        raise ValueError("gradient_accumulation_steps must be >= 1")

    model.train()
    # Start from clean gradients so nothing left over from an earlier call
    # leaks into the first update of this pass.
    optimizer.zero_grad()
    total_acc, total_loss, total_count = 0, 0, 0

    start_time = time.time()
    num_of_seen_batches = 0
    for tokenized, labels in tqdm(dataloader):
        num_of_seen_batches += 1
        logits = model(**tokenized).logits
        loss = criterion(logits, labels)
        # Only the gradient is scaled for accumulation; the unscaled loss is
        # kept for logging so the reported value is comparable to the
        # validation loss.
        (loss / gradient_accumulation_steps).backward()

        total_acc += (logits.argmax(1) == labels).sum().item()
        total_loss += loss.item() * labels.size(0)
        total_count += labels.size(0)

        if num_of_seen_batches % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if log_interval > 0 and num_of_seen_batches % log_interval == 0:
            print(
                f"||| {num_of_seen_batches:5d}/{len(dataloader):5d} batches | "
                f"time: {time.time() - start_time:5.2f}s | "
                f"accuracy {total_acc / total_count:8.3f} | "
                f"loss {total_loss / total_count:8.3f}",
                flush=True,
            )
            # Running statistics cover only the batches since the last log line.
            total_acc, total_loss, total_count = 0, 0, 0

        if eval_interval > 0 and num_of_seen_batches % eval_interval == 0:
            val_loss, val_accuracy = evaluation(model, eval_dataloader, criterion)
            # evaluation() switches the model to eval mode; go back to train
            # mode so dropout stays active for the remaining batches.
            model.train()
            print("-" * 59)
            print(
                f"| validation loss {val_loss:8.3f} | "
                f"validation accuracy {val_accuracy:8.3f}"
            )
            print("-" * 82)

    # Apply gradients from a trailing, incomplete accumulation group instead
    # of silently carrying them into the next call. They were scaled for a
    # full group, so this final update is proportionally smaller.
    if num_of_seen_batches % gradient_accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()

## DataLoaderの作成

* バッチサイズはGPUのメモリ量に応じて決める。

In [None]:
# Mini-batch size; pick it according to the available GPU memory.
batch_size = 4

# Only the training split is shuffled; all three loaders share collate_fn.
train_dataloader = DataLoader(
    ds["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_dataloader = DataLoader(
    ds["validation"],
    batch_size=batch_size,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    ds["test"],
    batch_size=batch_size,
    collate_fn=collate_fn,
)

## fine-tuning

### trainingの設定

In [None]:
epochs = 5 # in practice, train for somewhat more epochs than this
learning_rate = 1e-4
# Loss used by both train() and evaluation().
criterion = torch.nn.CrossEntropyLoss()
# All parameters of the LoRA-wrapped model are handed to AdamW.
# NOTE(review): presumably only those with requires_grad=True get updated — confirm.
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=learning_rate)

### LoRAのtraining

* RTX 4090で実行すると、6分弱でvalidation accuracyが0.95を超える。

In [None]:
# Run the configured number of epochs, printing a summary line after each one.
for epoch in range(epochs):
    epoch_start_time = time.time()
    train(lora_model, train_dataloader, valid_dataloader, optimizer, criterion, gradient_accumulation_steps=8, log_interval=50, eval_interval=200)
    print("-" * 59)
    elapsed = time.time() - epoch_start_time
    print(
        f"| end of epoch {epoch+1:3d} | "
        f"time: {elapsed:5.2f}s | "
        # Scientific notation: with three fixed decimals a learning rate of
        # 1e-4 would be printed as 0.000.
        f"lr = {optimizer.param_groups[0]['lr']:.2e} | "
    )
    print("-" * 82)

## test setでの評価

In [None]:
# Measure loss and accuracy of the fine-tuned model on the held-out test split.
loss, accu_val = evaluation(lora_model, test_dataloader, criterion)
print(f"test loss {loss:8.3f} | test accuracy {accu_val:8.3f}")

## LoRAの保存

In [None]:
# Directory the fine-tuned model is written to.
adapter_path = "lora_finetuned_model"
# NOTE(review): for a PEFT model this presumably stores the adapter weights and
# config rather than the full base model — confirm against the PEFT docs.
lora_model.save_pretrained(adapter_path)

## LoRAの読み込み

* モデルを一旦削除する。

In [None]:
# Drop the name so the adapter has to be reloaded from disk below.
# NOTE(review): `model` and `optimizer` presumably still reference the same
# underlying weights, so this alone may not free GPU memory — confirm.
del lora_model

* モデルを読み込み直してから、学習済みのLoRAを適用する。

In [None]:
# Load a fresh copy of the pretrained base model with a classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)
# Same directory that the fine-tuned model was saved to.
adapter_path = "lora_finetuned_model"
# Attach the saved LoRA adapter to the freshly loaded base model.
lora_model = PeftModel.from_pretrained(model, adapter_path)

* 先ほどと同じ評価値が出るはず。

In [None]:
# Re-run the test evaluation with the reloaded adapter; the numbers are
# expected to match those obtained before saving.
loss, accu_val = evaluation(lora_model, test_dataloader, criterion)
print(f"test loss {loss:8.3f} | test accuracy {accu_val:8.3f}")