<a href="https://colab.research.google.com/github/tomonari-masada/course2025-nlp/blob/main/10_finetuning_LLMs_with_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LoRAを使ったLLMのfine-tuning

## 準備

In [None]:
import time
from tqdm.auto import tqdm
import numpy as np
import torch
from torch.utils.data import DataLoader
from datasets import load_from_disk
import evaluate
from transformers import set_seed, AutoTokenizer, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType, PeftModel

# Fix the random seed so repeated runs of the notebook are comparable.
set_seed(0)
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## データセット

* livedoorニュースコーパスを使う。

In [None]:
# Download the dataset archive from the course repository, then extract it
# into the current working directory.
!wget https://github.com/tomonari-masada/course2025-nlp/raw/refs/heads/main/livedoor_ds.tar.gz
!tar zxf livedoor_ds.tar.gz

In [None]:
# Load the dataset stored in the "livedoor_ds" directory.
ds = load_from_disk("livedoor_ds")
# Show the dataset summary as the cell output.
ds

In [None]:
# Category names of the livedoor news corpus.
# NOTE(review): presumably the position in this list corresponds to the
# integer label stored in the "category" column — confirm against how the
# dataset was built.
category_names = [
    "movie-enter",
    "it-life-hack",
    "kaden-channel",
    "topic-news",
    "livedoor-homme",
    "peachy",
    "sports-watch",
    "dokujo-tsushin",
    "smax",
]

# Number of distinct labels found in the training split; later used as the
# number of output classes of the classification model.
num_labels = len(set(ds["train"]["category"]))
num_labels

## LLMの選定

* ここでは`intfloat/multilingual-e5-large-instruct`を使う。
  * 他のLLMでも、コードは同様に書けばよい。

## トークナイザの取得

In [None]:
# Hugging Face model identifier shared by the tokenizer and the model.
model_name = "intfloat/multilingual-e5-large-instruct"
# Tokenizer matching the pretrained model; used by the collate function.
tokenizer = AutoTokenizer.from_pretrained(model_name)

* collate関数
  * あとでDataLoaderに使う。

In [None]:
def collate_fn(batch):
    """Turn a list of dataset samples into a tokenized batch and a label tensor.

    Args:
        batch: Samples from the dataset; each one is indexed with the keys
            "content" (the text) and "category" (the label).

    Returns:
        A pair ``(tokenized, labels)``, both moved to ``device``: the
        tokenizer output for the batch texts (padded, truncated to
        ``tokenizer.model_max_length``, returned as PyTorch tensors) and a
        tensor built from the labels.
    """
    contents = [example["content"] for example in batch]
    categories = [example["category"] for example in batch]
    encoded = tokenizer(
        contents,
        padding=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
        return_tensors="pt",
    )
    encoded = encoded.to(device)
    label_tensor = torch.tensor(categories).to(device)
    return encoded, label_tensor

## モデルの取得

In [None]:
# Load the pretrained model with a sequence-classification head sized for
# `num_labels` classes, then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
).to(device)

## LoRA

### LLMの構造の確認
* どの部分をLoRAアダプタで更新するかを決める。

In [None]:
# Display the module structure to decide which layers get LoRA adapters.
model

### LoRAの設定

In [None]:
# LoRA configuration for a sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,  # the adapters are going to be trained
    r=8,  # rank of the low-rank update matrices
    lora_alpha=32,  # scaling factor for the LoRA update
    lora_dropout=0.1,  # dropout probability used in the LoRA layers
    # Names of the modules that receive LoRA adapters.
    # NOTE(review): presumably chosen from the module names in the model
    # printout; a short name such as "dense" likely matches several layers —
    # confirm with the `lora_model` printout.
    target_modules=["word_embeddings", "query", "value", "key", "dense"],
)
# Wrap the model with the LoRA adapters and report how many parameters are
# trainable.
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

In [None]:
# Display the wrapped model to check where the LoRA layers were inserted.
lora_model

## trainingのためのヘルパ関数

In [None]:
def train(model, dataloader, optimizer, criterion, log_interval=10):
    """Train ``model`` for one pass over ``dataloader``.

    Running accuracy and loss are accumulated for every batch. Every
    ``log_interval`` batches a progress line is printed and the running
    statistics are reset, so each line describes only the most recent
    ``log_interval`` batches. The reported time is measured from the start of
    this call.

    Args:
        model: Model called as ``model(**tokenized)``; its output must expose
            a ``.logits`` attribute.
        dataloader: Sized iterable yielding ``(tokenized, labels)`` pairs.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called
            once per batch.
        criterion: Loss function called as ``criterion(logits, labels)``.
        log_interval: Number of batches between progress lines; must be a
            positive integer.
    """
    model.train()
    total_acc, total_loss, total_count = 0, 0, 0

    start_time = time.time()
    for num_of_seen_batches, (tokenized, labels) in enumerate(
        tqdm(dataloader), start=1
    ):
        optimizer.zero_grad()
        logits = model(**tokenized).logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # The statistics are updated inside the loop so that every batch is
        # counted, not only the last one.
        num_samples = labels.size(0)
        total_acc += (logits.argmax(1) == labels).sum().item()
        # `loss` is used as a per-batch mean here, so it is weighted by the
        # batch size before being accumulated.
        total_loss += loss.item() * num_samples
        total_count += num_samples

        if num_of_seen_batches % log_interval == 0:
            print(
                f"||| {num_of_seen_batches:5d}/{len(dataloader):5d} batches | "
                f"time: {time.time() - start_time:5.2f}s | "
                f"accuracy {total_acc / total_count:8.3f} | "
                f"loss {total_loss / total_count:8.3f}"
            )
            total_acc, total_loss, total_count = 0, 0, 0

## 評価のためのヘルパ関数

In [None]:
def evaluate(model, dataloader, criterion):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    NOTE: this function has the same name as the ``evaluate`` package
    imported at the top of the file; once this definition runs, the name
    ``evaluate`` refers to this function.

    Args:
        model: Model called as ``model(**tokenized)``; its output must expose
            a ``.logits`` attribute.
        dataloader: Iterable yielding ``(tokenized, labels)`` pairs.
        criterion: Unused. Accepted only so that existing call sites keep
            working; no loss is computed during evaluation.

    Returns:
        The fraction of samples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for tokenized, labels in tqdm(dataloader):
            predictions = model(**tokenized).logits.argmax(1)
            total_acc += (predictions == labels).sum().item()
            total_count += labels.size(0)
    return total_acc / total_count

## DataLoaderの作成

* バッチサイズはGPUのメモリ量に応じて決める。

In [None]:
# Samples per batch; choose it according to the available GPU memory.
batch_size = 4

# Only the training split is shuffled. All three loaders use the same
# collate function to tokenize the texts and build the label tensor.
train_dataloader = DataLoader(
    ds["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_dataloader = DataLoader(
    ds["validation"],
    batch_size=batch_size,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    ds["test"],
    batch_size=batch_size,
    collate_fn=collate_fn,
)

## fine-tuning

### trainingの設定

In [None]:
# Number of passes over the training split.
epochs = 3
# Learning rate given to the optimizer.
learning_rate = 1e-4
# Cross-entropy loss between the class logits and the integer labels.
criterion = torch.nn.CrossEntropyLoss()
# AdamW over the parameters of the LoRA-wrapped model.
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=learning_rate)

### LoRAのtraining

In [None]:
# Fine-tune for `epochs` epochs and report the validation accuracy after each
# one. The reported time covers both training and validation.
for epoch in range(epochs):
    epoch_start_time = time.time()
    train(lora_model, train_dataloader, optimizer, criterion)
    accu_val = evaluate(lora_model, valid_dataloader, criterion)
    elapsed = time.time() - epoch_start_time
    # The learning rate is shown in scientific notation: with the fixed-point
    # format `.3f`, a value such as 1e-4 would be displayed as 0.000.
    summary = (
        f"| end of epoch {epoch+1:3d} | "
        f"time: {elapsed:5.2f}s | "
        f"lr = {optimizer.param_groups[0]['lr']:.1e} | "
        f"validation accuracy {accu_val:8.3f}"
    )
    # Both rules are sized to the summary line so the output lines up.
    separator = "-" * len(summary)
    print(separator)
    print(summary)
    print(separator)

## test setでの評価

In [None]:
# Accuracy of the fine-tuned model on the test split.
# NOTE: the variable name `accu_val` is reused here although the value is a
# test-set accuracy.
accu_val = evaluate(lora_model, test_dataloader, criterion)
print(f"test accuracy {accu_val:8.3f}")

## LoRAの保存

In [None]:
# Directory where the adapter is written by `save_pretrained`.
adapter_path = "lora_finetuned_model"
lora_model.save_pretrained(adapter_path)

## LoRAの読み込み

* 下のセルでは、元のモデル（LoRAアダプタを適用する前のベースモデル）があらかじめ変数`model`に読み込んであると想定している。

In [None]:
# Load the saved adapter from `adapter_path` on top of `model`.
# NOTE(review): if `model` was already passed to `get_peft_model` earlier in
# the same session, it presumably already contains LoRA layers — confirm
# whether a freshly loaded base model is intended here.
adapter_path = "lora_finetuned_model"
lora_model = PeftModel.from_pretrained(model, adapter_path)

In [None]:
# Re-evaluate on the test split with the reloaded adapter.
accu_val = evaluate(lora_model, test_dataloader, criterion)
print(f"test accuracy {accu_val:8.3f}")