In [1]:
# https://www.kaggle.com/code/emiz6413/training-gemma-2-9b-4-bit-qlora-fine-tuning

In [2]:
# gemma-2 is available from transformers>=4.42.3
!pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft

[0m

In [3]:
!pip install datasets

[0m

In [4]:
!pip install scikit-learn

[0m

In [5]:
import os
import copy
from dataclasses import dataclass

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    Qwen2TokenizerFast,
    Qwen2ForSequenceClassification,
    AutoTokenizer,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [6]:
@dataclass
class Config:
    output_dir: str = "qwen2001"
    checkpoint: str = "unsloth/Qwen2-72B-bnb-4bit"  # 4-bit quantized llama-3.1-8b-instruct
    max_length: int = 256
    n_splits: int = 5
    fold_idx: int = 0
    optim_type: str = "adamw_8bit"
    per_device_train_batch_size: int = 1
    gradient_accumulation_steps: int = 4  # global batch size is 24
    per_device_eval_batch_size: int = 24
    n_epochs: int = 1
    freeze_layers: int = 16  # there're 42 layers in total, we don't add adapters to the first 16 layers
    lr: float = 2e-4
    warmup_steps: int = 20
    lora_r: int = 16
    lora_alpha: float = lora_r * 2
    lora_dropout: float = 0.05
    lora_bias: str = "none"
    
config = Config()

In [7]:
training_args = TrainingArguments(
    output_dir=config.output_dir,
    overwrite_output_dir=True,
    report_to="none",
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="steps",
    save_steps=200,
    optim=config.optim_type,
    fp16=True,
    learning_rate=config.lr,
    warmup_steps=config.warmup_steps,
)

In [8]:
lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    # only target self-attention
    #target_modules=["q_proj", "k_proj", "v_proj"],
    target_modules=['k_proj','up_proj','gate_proj','q_proj','base_layer','o_proj','v_proj','down_proj'],
    #target_modules=["q_proj", "k_proj", 'v_proj','o_proj','gate_proj'],
    layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    task_type=TaskType.SEQ_CLS,
)

In [9]:
tokenizer = Qwen2TokenizerFast.from_pretrained(config.checkpoint)#モデルによって変える
tokenizer.add_eos_token = True  # We'll add <eos> at the end
tokenizer.padding_side = "right"

In [10]:
model = Qwen2ForSequenceClassification.from_pretrained(
    config.checkpoint,
    num_labels=2,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at unsloth/Qwen2-72B-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Qwen2ForSequenceClassification(
      (model): Qwen2Model(
        (embed_tokens): Embedding(152064, 8192, padding_idx=151646)
        (layers): ModuleList(
          (0-15): 16 x Qwen2DecoderLayer(
            (self_attn): Qwen2SdpaAttention(
              (q_proj): Linear4bit(in_features=8192, out_features=8192, bias=True)
              (k_proj): Linear4bit(in_features=8192, out_features=1024, bias=True)
              (v_proj): Linear4bit(in_features=8192, out_features=1024, bias=True)
              (o_proj): Linear4bit(in_features=8192, out_features=8192, bias=False)
              (rotary_emb): Qwen2RotaryEmbedding()
            )
            (mlp): Qwen2MLP(
              (gate_proj): Linear4bit(in_features=8192, out_features=29568, bias=False)
              (up_proj): Linear4bit(in_features=8192, out_features=29568, bias=False)
              (down_proj): Linear4bit(in_features=29568, out_features=8192, bia

In [11]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit  # (default:torch.nn.Linear,4bit:bnb.nn.Linear4bit,8bit:bnb.nn.Linear8bitLt)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

find_all_linear_names(model)

['gate_proj',
 'down_proj',
 'up_proj',
 'k_proj',
 'v_proj',
 'o_proj',
 'base_layer',
 'q_proj']

In [12]:
#層の表示
for name, param in model.named_parameters():
    print(name)

base_model.model.model.embed_tokens.weight
base_model.model.model.layers.0.self_attn.q_proj.weight
base_model.model.model.layers.0.self_attn.q_proj.bias
base_model.model.model.layers.0.self_attn.k_proj.weight
base_model.model.model.layers.0.self_attn.k_proj.bias
base_model.model.model.layers.0.self_attn.v_proj.weight
base_model.model.model.layers.0.self_attn.v_proj.bias
base_model.model.model.layers.0.self_attn.o_proj.weight
base_model.model.model.layers.0.mlp.gate_proj.weight
base_model.model.model.layers.0.mlp.up_proj.weight
base_model.model.model.layers.0.mlp.down_proj.weight
base_model.model.model.layers.0.input_layernorm.weight
base_model.model.model.layers.0.post_attention_layernorm.weight
base_model.model.model.layers.1.self_attn.q_proj.weight
base_model.model.model.layers.1.self_attn.q_proj.bias
base_model.model.model.layers.1.self_attn.k_proj.weight
base_model.model.model.layers.1.self_attn.k_proj.bias
base_model.model.model.layers.1.self_attn.v_proj.weight
base_model.model.mo

In [13]:
ds = Dataset.from_csv("../train.csv")
#ds = ds.select(torch.arange(100))  # We only use the first 100 data for demo purpose

In [14]:
import pandas as pd
train = pd.read_csv("../train.csv")
train.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count
0,0,25,3-season skirt!,"Adorable, well-made skirt! lined and very slim...",5,1,4
1,0,39,Very cute,Love the asymmetrical hem. waist fit snugly as...,5,1,0
2,0,42,Beautiful! fruns small for typical retailer si...,I love this skirt! i wasn't sure about the mix...,5,1,5
3,0,45,,I was really pleased with this skirt. the ligh...,5,1,9
4,0,57,"Unique, pretty asymmetric skirt",I saw this skirt in retailer several months ag...,5,1,1


In [15]:
train.columns

Index(['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count'],
      dtype='object')

In [16]:
from typing import Any, Dict, List

class CustomTokenizer:
    def __init__(
        self, 
        tokenizer: PreTrainedTokenizerBase, 
        max_length: int
    ) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __call__(self, batch: dict) -> dict:
        Clothing_ID = ["<Clothing ID>: " + self.process_text(t) for t in batch["Clothing ID"]]
        Age = ["\n\n<Age>: " + self.process_text(t) for t in batch["Age"]]  # "AGe"を"Age"に修正
        Title = ["\n\n<Title>: " + self.process_text(t) for t in batch["Title"]]
        Review_Text = ["\n\n<Review Text>: " + self.process_text(t) for t in batch["Review Text"]]
        Positive_Feedback_Count = ["\n\n<Positive Feedback Count>: " + self.process_text(t) for t in batch["Positive Feedback Count"]]
        
        texts = [c + a + t + r + p for c, a, t, r, p in zip(Clothing_ID, Age, Title, Review_Text, Positive_Feedback_Count)]
        tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)

        labels = batch["Recommended IND"]  # ラベルの処理を簡略化
            
        return {**tokenized, "labels": labels}
        
    @staticmethod
    def process_text(text: Any) -> str:
        if isinstance(text, str):
            try:
                return " ".join(eval(text, {"null": ""}))
            except:
                return str(text)
        else:
            return str(text)

In [17]:
encode = CustomTokenizer(tokenizer, max_length=config.max_length)
ds = ds.map(encode, batched=True)

In [18]:
ds["labels"]

[1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,


In [19]:
#debug
print(tokenizer.decode(ds["input_ids"][0]))

<Clothing ID>: 0

<Age>: 25

<Title>: 3-season skirt!

<Review Text>: Adorable, well-made skirt! lined and very slimming. i had to size up b/c it runs a bit snug around the waist. however, it's worth it b/c this will match many long and short sleeve tops!

<Positive Feedback Count>: 4


In [20]:
def compute_metrics(eval_preds: EvalPrediction) -> dict:
    preds = eval_preds.predictions
    labels = eval_preds.label_ids
    
    # ソフトマックス関数を適用して確率に変換
    probs = np.exp(preds) / np.sum(np.exp(preds), axis=1, keepdims=True)
    
    # 予測クラスを取得
    pred_classes = np.argmax(probs, axis=1)
    
    # 正解率（Accuracy）を計算
    acc = accuracy_score(y_true=labels, y_pred=pred_classes)
    
    # ROC AUC スコアを計算（マルチクラスの場合は'ovr'を使用）
    num_classes = probs.shape[1]
    if num_classes == 2:
        # バイナリ分類の場合
        auc = roc_auc_score(labels, probs[:, 1])
    else:
        # マルチクラス分類の場合
        auc = roc_auc_score(labels, probs, multi_class='ovr', average='macro')
    
    return {"acc": acc, "roc_auc": auc}

In [21]:
folds = [
    (
        [i for i in range(len(ds)) if i % config.n_splits != fold_idx],
        [i for i in range(len(ds)) if i % config.n_splits == fold_idx]
    ) 
    for fold_idx in range(config.n_splits)
]

In [22]:
train_idx, eval_idx = folds[config.fold_idx]

trainer = Trainer(
    args=training_args, 
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds.select(train_idx),#ds,ds.select(train_idx),
    eval_dataset=ds.select(eval_idx),#ds.select(torch.arange(100)),#ds.select(eval_idx),
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
trainer.train()



Epoch,Training Loss,Validation Loss,Acc,Roc Auc
1,0.0686,0.257942,0.9335,0.97556




TrainOutput(global_step=2000, training_loss=0.3791950275599957, metrics={'train_runtime': 29545.2763, 'train_samples_per_second': 0.271, 'train_steps_per_second': 0.068, 'total_flos': 3.3702726972063744e+17, 'train_loss': 0.3791950275599957, 'epoch': 1.0})