In [1]:
import gc
import os
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import safetensors
import seaborn as sns
import torch
from datasets import Dataset
from hydra import compose, initialize
from jinja2 import Template
from matplotlib import pyplot as plt
from omegaconf import OmegaConf
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from src.seed import seed_everything
from tqdm.auto import tqdm
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
)

warnings.filterwarnings("ignore")


with initialize(config_path="config", version_base=None):
    cfg = compose(config_name="config")
    cfg.exp_number = Path().resolve().name

print(OmegaConf.to_yaml(cfg, resolve=True))

seed_everything(cfg.seed)
pl.Config.set_fmt_str_lengths(100)


gemma:
  model_path: unsloth/gemma-2-9b-it-bnb-4bit
  metric: auc
  max_length: 1024
  fp16: true
  learning_rate: 0.0001
  epochs: 2
  per_device_train_batch_size: 4
  gradient_accumulation_steps: 16
  per_device_eval_batch_size: 8
  steps: 50
  lr_scheduler_type: cosine
  weight_decay: 0.01
  optim: adamw_torch_fused
  lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  lora_bias: none
exp_number: '017'
run_name: base
data:
  data_root: ../../data
  results_root: ../../results
  train_path: ../../data/train.csv
  cloth_path: ../../data/clothing_master.csv
  test_path: ../../data/test.csv
  sample_submission_path: ../../data/sample_submission.csv
  results_dir: ../../results/017/base
seed: 42
n_splits: 4
target: Recommended IND



polars.config.Config

### LLM(gemma-2-9b-it)をQLoRAでファインチューニング
- 2nd & 7th place solを参考に実装
  - https://www.guruguru.science/competitions/24/discussions/4f2c7270-b67e-4e34-855a-3246f03cc278/
  - https://www.guruguru.science/competitions/24/discussions/bdfb41e9-a1ef-40e8-b67d-742b5a4458a2/
- Gemma2-9B(instruct tuning)のSequenceClassification
- 4bit量子化されたモデル(unsloth/gemma-2-9b-it-bnb-4bit)をQLoRAでファインチューニング
- エラー発生のため1foldのみの学習に留まる

### データの準備


In [2]:
train_df = pl.read_csv(cfg.data.train_path, try_parse_dates=True).fill_null("none")
test_df = pl.read_csv(cfg.data.test_path, try_parse_dates=True).fill_null("none")
cloth_df = pl.read_csv(cfg.data.cloth_path, try_parse_dates=True)

train_df = train_df.join(cloth_df, on="Clothing ID", how="left")
test_df = test_df.join(cloth_df, on="Clothing ID", how="left")

# labels列を作成
train_df = train_df.with_columns(pl.col(cfg.target).cast(pl.Int8).alias("labels"))

skf = StratifiedKFold(n_splits=cfg.n_splits, shuffle=True, random_state=cfg.seed)


In [3]:
# debug設定
DEBUG = False

if DEBUG:
    train_df = train_df.head(100)
    test_df = test_df.head(100)


In [4]:
# promptを作成
prompt_template = Template("""This reviewer's Age is {{ age }}.
The Clothing Type is {{ class_name }}.
The Review Title: {{ title }}
The Review Text: {{ review_text }}
>. Will the reviewer recommend this cloth?
""")


def make_prompt_column(df: pl.DataFrame) -> pl.DataFrame:
    prompts = []
    for row in df.iter_rows(named=True):
        prompts.append(make_prompt(row))

    df = df.with_columns(pl.Series(prompts).alias("prompt"))
    return df


def make_prompt(row):
    return prompt_template.render(
        age=row["Age"], class_name=row["Class Name"], title=row["Title"], review_text=row["Review Text"]
    )


train_df = make_prompt_column(train_df)
test_df = make_prompt_column(test_df)
train_df.head(3)


Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,labels,prompt
i64,i64,str,str,i64,i64,i64,str,str,str,i8,str
0,25,"""3-season skirt!""","""Adorable, well-made skirt! lined and very slimming. i had to size up b/c it runs a bit snug around t…",5,1,4,"""General""","""Bottoms""","""Skirts""",1,"""This reviewer's Age is 25. The Clothing Type is Skirts. The Review Title: 3-season skirt! The Review…"
0,39,"""Very cute""","""Love the asymmetrical hem. waist fit snugly as in perfectly. it ties in two spots with a hidden zipp…",5,1,0,"""General""","""Bottoms""","""Skirts""",1,"""This reviewer's Age is 39. The Clothing Type is Skirts. The Review Title: Very cute The Review Text:…"
0,42,"""Beautiful! fruns small for typical retailer sizing""","""I love this skirt! i wasn't sure about the mix of the back pattern with the front but it works! it i…",5,1,5,"""General""","""Bottoms""","""Skirts""",1,"""This reviewer's Age is 42. The Clothing Type is Skirts. The Review Title: Beautiful! fruns small for…"


In [5]:
# setup model and tokenizer
def setup_model_and_tokenizer():
    torch.cuda.empty_cache()
    gc.collect()

    tokenizer = AutoTokenizer.from_pretrained(cfg.gemma.model_path)
    tokenizer.add_eos_token = True
    tokenizer.padding_side = "right"  # 文末に<eos>トークンを追加
    tokenizer.pad_token = tokenizer.eos_token  # <eos>をpad_tokenとして設定

    peft_config = LoraConfig(
        r=cfg.gemma.lora_r,
        lora_alpha=cfg.gemma.lora_alpha,
        lora_dropout=cfg.gemma.lora_dropout,
        bias=cfg.gemma.lora_bias,
        inference_mode=False,
        task_type=TaskType.SEQ_CLS,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.gemma.model_path,
        num_labels=2,
        device_map="auto",
    )
    model.config.use_cache = False  # キャッシュを使用しない
    model = prepare_model_for_kbit_training(model)  # 量子化したモデルをファインチューニング可能にする
    model = get_peft_model(model, peft_config)  # モデルにLoRAを適用
    # model.print_trainable_parameters()
    return model, tokenizer


In [6]:
# tokenize function
def tokenize(sample):
    return tokenizer(sample["prompt"], padding="max_length", truncation=True)


### 学習

In [7]:
# metricをAUCに変更
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = torch.softmax(torch.tensor(preds), dim=1).numpy()
    score = roc_auc_score(labels, preds[:, 1])
    return {"auc": score}


# 実験結果格納用のディレクトリを作成
cfg.run_name = time.strftime("%Y%m%d_%H%M%S")
Path(cfg.data.results_dir).mkdir(exist_ok=True, parents=True)

y_train = train_df[cfg.target].to_numpy()
# oof = np.zeros(len(y_train))
oof_auc = []

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, y_train)):
    # Setup model and tokenizer
    model, tokenizer = setup_model_and_tokenizer()

    # Setup dataset
    ds_train = Dataset.from_pandas(train_df[train_idx][["prompt", "labels"]].clone().to_pandas())
    ds_val = Dataset.from_pandas(train_df[val_idx][["prompt", "labels"]].clone().to_pandas())
    ds_test = Dataset.from_pandas(test_df.select("prompt").clone().to_pandas())

    ds_train = ds_train.map(tokenize).remove_columns("prompt")
    ds_val = ds_val.map(tokenize).remove_columns("prompt")
    ds_test = ds_test.map(tokenize).remove_columns("prompt")

    # Setup trainer
    output_dir = os.path.join(cfg.data.results_dir, f"fold{fold}")

    train_args = TrainingArguments(
        output_dir=output_dir,  # 学習結果の出力ディレクトリ
        fp16=cfg.gemma.fp16,  # 16ビット浮動小数点演算を使用するかどうか
        learning_rate=cfg.gemma.learning_rate,  # 学習率
        num_train_epochs=cfg.gemma.epochs,  # 学習エポック数
        per_device_train_batch_size=cfg.gemma.per_device_train_batch_size,  # デバイスあたりの訓練バッチサイズ
        per_device_eval_batch_size=cfg.gemma.per_device_eval_batch_size,  # デバイスあたりの評価バッチサイズ
        gradient_accumulation_steps=cfg.gemma.gradient_accumulation_steps,  # 勾配蓄積ステップ数
        gradient_checkpointing=True,  # 勾配チェックポイントを使用するかどうか
        report_to="none",  # レポート出力先（なし）
        evaluation_strategy="steps",  # 評価戦略（ステップごと）
        do_eval=True,  # 評価を行うかどうか
        eval_steps=cfg.gemma.steps,  # 評価を行うステップ間隔
        save_total_limit=1,  # 保存するモデルの最大数
        save_strategy="steps",  # 保存戦略（ステップごと）
        save_steps=cfg.gemma.steps,  # モデルを保存するステップ間隔
        logging_steps=cfg.gemma.steps,  # ログを出力するステップ間隔
        load_best_model_at_end=True,  # 学習終了時に最良のモデルをロードするかどうか
        lr_scheduler_type=cfg.gemma.lr_scheduler_type,  # 学習率スケジューラーの種類
        metric_for_best_model=cfg.gemma.metric,  # 最良モデルを判断するための評価指標
        greater_is_better=True,  # 評価指標が大きいほど良いかどうか
        warmup_ratio=0.1,  # ウォームアップの比率
        weight_decay=cfg.gemma.weight_decay,  # 重み減衰
        save_safetensors=True,  # SafeTensorsフォーマットで保存するかどうか
        seed=cfg.seed,  # 乱数シード
        data_seed=cfg.seed,  # データシャッフル用の乱数シード
        optim=cfg.gemma.optim,  # 最適化アルゴリズム
    )

    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Perform inference on val and test datasets
    pred_val = torch.softmax(torch.tensor(trainer.predict(ds_val).predictions), dim=1).numpy()[:, 1]
    pred_test = torch.softmax(torch.tensor(trainer.predict(ds_test).predictions), dim=1).numpy()[:, 1]

    # Save the model, predictions
    final_output_dir = f"{cfg.data.results_dir}/fold{fold}/final"
    trainer.save_model(final_output_dir)
    np.save(f"{final_output_dir}/val.npy", pred_val)
    np.save(f"{final_output_dir}/test.npy", pred_test)
    # tokenizer.save_pretrained(final_output_dir)

    # Calculate and log AUC score
    roc_auc = roc_auc_score(y_train[val_idx], pred_val)
    print(f"Fold {fold} AUC: {roc_auc}")
    oof_auc.append(roc_auc)

    # Clean up to free memory
    del model
    torch.cuda.empty_cache()
    gc.collect()


print(f"Mean AUC score across all folds: {np.mean(oof_auc)}")


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/11155 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Auc
50,0.3489,0.161682,0.974521
100,0.214,0.19557,0.960731
150,0.1548,0.171763,0.976672
200,0.1435,0.150833,0.977598


Fold 0 AUC: 0.9775979776283826


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [12]:
torch.cuda.empty_cache()
gc.collect()


14184

### Sub


In [13]:
# 提出
sub_df = pl.read_csv(cfg.data.sample_submission_path)
sub_df = sub_df.with_columns(pl.Series(pred_test).alias("target"))
sub_df.write_csv(os.path.join(cfg.data.results_dir, f"{cfg.run_name}_submission.csv"))
sub_df.head()


target
f32
0.999411
0.590307
0.999469
0.175185
0.998186
