In [1]:
import os
import gc
from matplotlib.pylab import plt
import plotly.graph_objects as go
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

import torch
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DataCollatorForLanguageModeling, TrainingArguments
from trl import SFTTrainer
from peft import LoraConfig
from datasets import load_dataset
from huggingface_hub import login

import openai
import config

In [None]:
# Authenticate against OpenAI, Weights & Biases and the Hugging Face Hub with
# the keys stored in the local `config` module.
openai.api_key = config.OPENAI_API_KEY
# Pass the key explicitly: a client instance resolves its key from the
# `api_key` argument (or the OPENAI_API_KEY environment variable), not from the
# module-level `openai.api_key` attribute set above.
client = openai.Client(api_key=config.OPENAI_API_KEY)
# Must be set before the W&B run starts so the Trainer integration picks it up.
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
wandb.login(key=config.WANDB_API_KEY)
login(token=config.HUGGINGFACE_API_KEY)

In [2]:
# --- Experiment identity -----------------------------------------------------
PROJECT = "wandb_tutorial"

# --- Base model and data -----------------------------------------------------
MODEL = "llm-jp/llm-jp-1.3b-v1.0"
# MODEL = "llm-jp/llm-jp-3-13b"
DATASET = "llm-jp/databricks-dolly-15k-ja"
TRAIN_SAMPLES = 10000
EVAL_SAMPLES = 10
EPOCHS = 5
DEVICE_MAP = "auto"
BATCH_SIZE = 8

# --- LoRA adapter ------------------------------------------------------------
LORA_RANK = 128
LORA_ALPHA = 256
LORA_DROPOUT = 0.1
# Module names to adapt for llm-jp/llm-jp-1.3b-v1.0.
TARGET_MODULES = ["c_attn", "c_proj", "c_fc"]
# Module names to adapt for llm-jp/llm-jp-3-13b:
# TARGET_MODULES = ['gate_proj', 'up_proj', 'o_proj', 'v_proj', 'k_proj', 'q_proj', 'down_proj']

# --- Optimisation ------------------------------------------------------------
LEARNING_RATE = 1e-5
OPTIMIZER = "paged_adamw_32bit"
LR_SCHEDULER_TYPE = "cosine"  # learning-rate schedule
MAX_GRAD_NORM = 0.3  # maximum gradient norm (gradient clipping)
WARMUP_RATIO = 0.1  # fraction of steps used for linear warm-up (0 -> learning rate)
WEIGHT_DECAY = 0.001  # weight decay applied to all layers except bias/LayerNorm weights

# --- Run metadata shown in W&B -----------------------------------------------
NAME = f"{MODEL}_lora_experiment"
NOTES = f"{MODEL}モデルに対してLoRAを適用した実験"
TAGS = [MODEL, "lora", "fine-tuning"]

# Experiment settings, recorded as the W&B run config and read back by the
# training cells below.
params = dict(
    learning_rate=LEARNING_RATE,
    architecture=MODEL,
    dataset=DATASET,
    train_samples=TRAIN_SAMPLES,
    eval_samples=EVAL_SAMPLES,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    optimizer=OPTIMIZER,
    lora_rank=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
    max_grad_norm=MAX_GRAD_NORM,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
)

In [None]:
# Start the W&B run. The project name comes from the PROJECT constant so the
# configuration cell stays the single place to change it.
wandb.init(
    project=PROJECT,  # project name
    name=NAME,  # run name
    config=params,  # hyperparameters recorded with the run
    notes=NOTES,  # free-text description of the experiment
    tags=TAGS,  # tags
)

In [4]:
# bitsandbytes quantization settings passed to the model loader below:
# 4-bit loading, NF4 quant type, double quantization enabled, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Load the base causal LM named by MODEL with the quantization settings from
# `bnb_config`, placed according to DEVICE_MAP. `use_cache=False` is passed
# through to the loader (presumably to disable the KV cache during training —
# TODO confirm).
pretrained_model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    quantization_config=bnb_config,
    device_map=DEVICE_MAP,
    use_cache=False
)

In [None]:
# List the module names that are candidates for LoRA `target_modules`
import torch
from transformers import Conv1D

def get_specific_layer_names(model, layer_types=None, name_depth=4):
    """Collect candidate module names for LoRA ``target_modules``.

    Walks ``model.named_modules()`` and, for every module that is an instance
    of ``layer_types``, takes the component at position ``name_depth`` of its
    dotted name (e.g. ``"transformer.h.0.attn.c_attn"`` -> ``"c_attn"`` at the
    default depth of 4).

    Args:
        model: Object exposing ``named_modules()`` (a ``torch.nn.Module``).
        layer_types: Class or tuple of classes to match. Defaults to
            ``torch.nn.Linear``, ``torch.nn.Embedding``, ``torch.nn.Conv2d``
            and ``transformers.Conv1D``.
        name_depth: Zero-based index of the dotted-name component to report.

    Returns:
        list[str]: One entry per matching module, in traversal order
        (duplicates are kept). Modules whose name has no component at
        ``name_depth`` are skipped rather than reported as ``''``.
    """
    if layer_types is None:
        layer_types = (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)

    layer_names = []
    for name, module in model.named_modules():
        if not isinstance(module, layer_types):
            continue
        parts = name.split('.')
        # Shallow modules have nothing at `name_depth`; an empty placeholder is
        # not a usable target-module name, so leave them out.
        if len(parts) > name_depth:
            layer_names.append(parts[name_depth])

    return layer_names

# Show the distinct names in sorted order so the cell output is stable across
# kernel restarts (plain set iteration order for strings is not).
sorted(set(get_specific_layer_names(pretrained_model)))

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Only fall back to EOS when the tokenizer ships without a pad token. The
# trainer below uses DataCollatorForLanguageModeling, which excludes pad-token
# positions from the labels; aliasing pad to EOS unconditionally would also
# exclude every real EOS token from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [8]:
# Load the instruction dataset named by DATASET via `datasets.load_dataset`.
dataset = load_dataset(DATASET)

In [9]:
def format_prompt(sample):
    """Build the training prompt for one record and store it under ``"prompt"``.

    The prompt is the instruction section, an optional context section and the
    answer section joined by blank lines.

    Args:
        sample: Mapping with ``"instruction"`` and ``"response"`` entries and
            an optional ``"context"`` entry. A missing, ``None`` or empty
            context omits the context section.

    Returns:
        The same ``sample`` object, with ``sample["prompt"]`` set.
    """
    # NOTE(review): there is no separator between the preamble sentence and the
    # "### Instruction" header — confirm this is the intended template.
    instruction = f"以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。### Instruction\n{sample['instruction']}"
    sections = [instruction]

    # Truthiness covers "", None and an absent key; len() would raise on None.
    context_text = sample.get("context")
    if context_text:
        sections.append(f"### Context\n{context_text}")

    sections.append(f"### Answer\n{sample['response']}")
    sample["prompt"] = "\n\n".join(sections)
    return sample

In [10]:
# Apply `format_prompt` to every record, adding the "prompt" field.
dataset = dataset.map(format_prompt)

In [11]:
# Drop the source columns now that "prompt" holds the formatted text.
# NOTE(review): rebinding `dataset` makes this cell non-idempotent — re-running
# it without re-running the cells above presumably fails because the columns
# are already gone.
dataset = dataset.remove_columns(['instruction', 'context', 'response', 'category'])

In [12]:
# Carve the train split into two contiguous, non-overlapping slices: the first
# `train_samples` rows for training and the next `eval_samples` rows for eval.
n_train = params["train_samples"]
n_eval = params["eval_samples"]
train_split = dataset["train"]

train_samples = train_split.select(range(n_train))
eval_samples = train_split.select(range(n_train, n_train + n_eval))

In [13]:
# LoRA adapter configuration for causal-LM fine-tuning; rank, alpha, dropout
# and target modules all come from `params`, and no bias terms are adapted.
lora_config = LoraConfig(
    r=params["lora_rank"],
    lora_alpha=params["lora_alpha"],
    target_modules=params["target_modules"],
    lora_dropout=params["lora_dropout"],
    bias="none",
    task_type="CAUSAL_LM"
)

In [14]:
# Trainer hyperparameters, sourced from `params`. Logging and evaluation are
# both configured per epoch, outputs go to ./sample_output, and metrics are
# reported to W&B.
# NOTE(review): `evaluation_strategy` is the older spelling of this argument in
# transformers — confirm it matches the installed version.
training_args = TrainingArguments(
    output_dir='./sample_output',
    auto_find_batch_size=True,
    learning_rate=params["learning_rate"],
    num_train_epochs=params["epochs"],
    per_device_train_batch_size=params["batch_size"],
    per_device_eval_batch_size=params["batch_size"],
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    optim=params["optimizer"],
    lr_scheduler_type=params["lr_scheduler_type"],
    max_grad_norm=params["max_grad_norm"],
    warmup_ratio=params["warmup_ratio"],
    weight_decay=params["weight_decay"],
    report_to="wandb"
)

In [None]:
# Supervised fine-tuning trainer: takes the quantized base model plus the LoRA
# config, reads training text from the "prompt" field with max_seq_length=256,
# and uses a causal-LM (mlm=False) data collator.
trainer = SFTTrainer(
    model=pretrained_model,
    args=training_args,
    eval_dataset=eval_samples,
    train_dataset=train_samples,
    peft_config=lora_config,
    dataset_text_field="prompt",
    tokenizer=tokenizer,
    max_seq_length=256,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [17]:
# Keep a handle to the trainer's model; the evaluation loop below generates with it.
finetuned_model = trainer.model

In [18]:
# Load the MT-Bench questions and the GPT-4 reference answers.
import json


def _load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Blank lines are skipped. The file is decoded as UTF-8 explicitly because
    the platform default encoding is not guaranteed to be UTF-8 and the data is
    expected to contain non-ASCII (Japanese) text.
    """
    with open(path, "r", encoding="utf-8") as fin:
        return [json.loads(line) for line in fin if line.strip()]


question_data = _load_jsonl("data/question_full.jsonl")
gpt4_answer_data = _load_jsonl("data/gpt-4.jsonl")

# Flatten to one record per conversation turn, pairing each question turn with
# the reference answer turn at the same position.
# NOTE(review): questions and answers are paired purely by file order, and zip()
# silently truncates to the shorter input — confirm both files list the same
# questions in the same order.
evaluation_dataset = []
for question, answer in zip(question_data, gpt4_answer_data):
    for question_turn, answer_turn in zip(question["turns"], answer["choices"][0]["turns"]):
        evaluation_dataset.append({"question": question_turn, "answer": answer_turn, "category": question["category"]})

In [None]:
# Preview a few records instead of dumping the whole list into the cell output.
evaluation_dataset[:3]

In [20]:
# Load the judge prompt templates (one JSON object per line). Decode as UTF-8
# explicitly rather than relying on the platform default, and skip blank lines
# so a trailing newline does not crash json.loads.
with open("prompt/judge_ja_prompts.jsonl", "r", encoding="utf-8") as fin:
    judge_prompt_data = [json.loads(line) for line in fin if line.strip()]

In [21]:
from pydantic import BaseModel

# Structured-output schema passed as `response_format` to the judge model.
# (Documented with comments rather than a docstring so the generated schema is
# left untouched.)
class EvaluationScore(BaseModel):
    # Free-text reasoning behind the score; stored as "MTBench_判定理由".
    evaluation_reason: str
    # Integer score; stored as "MTBench_スコア".
    score: int


In [22]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def mtbench_score(question: str, answer: str, model_name: str, reference_answer: str="", **kwargs) -> dict:
    """Ask the judge model to grade ``answer`` to ``question``.

    Uses template ``judge_prompt_data[1]`` (which takes a reference answer) when
    ``reference_answer`` is non-empty and ``judge_prompt_data[0]`` otherwise.
    The call is retried with randomized exponential back-off, for at most six
    attempts, whenever an exception is raised.

    Args:
        question: The question shown to the evaluated model.
        answer: The evaluated model's answer.
        model_name: Judge model identifier passed to the OpenAI client.
        reference_answer: Optional reference answer for the judge.
        **kwargs: Extra arguments forwarded to the completion call
            (e.g. ``temperature``).

    Returns:
        dict: ``{"score": EvaluationScore}``.

    Raises:
        ValueError: If the response carries no parsed ``EvaluationScore``;
            raised inside the function so the retry policy gets another
            attempt instead of the caller crashing on ``None`` later.
    """
    if reference_answer:
        template = judge_prompt_data[1]["prompt_template"]
        judge_prompt = template.format(question=question, answer=answer, ref_answer_1=reference_answer)
    else:
        template = judge_prompt_data[0]["prompt_template"]
        judge_prompt = template.format(question=question, answer=answer)

    response = client.beta.chat.completions.parse(
        model=model_name,
        messages=[
            {"role": "user", "content": judge_prompt},
        ],
        response_format=EvaluationScore,
        **kwargs
    )

    parsed = response.choices[0].message.parsed
    if parsed is None:
        raise ValueError("judge model returned no parsed EvaluationScore")
    return {"score": parsed}


In [23]:
def generate_text(prompt, tokenizer, model, max_new_tokens=200, do_sample=True):
    """Generate a continuation of ``prompt`` and return only the new text.

    Only ``input_ids`` (and ``attention_mask`` when the tokenizer provides one)
    are forwarded to ``model.generate``. Tokenizers may emit extra keys such as
    ``token_type_ids`` that ``generate`` rejects; previously that error was
    swallowed by a bare ``except`` whose fallback returned the prompt and
    special tokens as part of the "answer".

    Args:
        prompt: Text to continue.
        tokenizer: Callable tokenizer returning a mapping of ``'pt'`` tensors
            and providing ``decode``.
        model: Model exposing ``device`` and ``generate``.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: Whether to sample instead of decoding greedily.

    Returns:
        str: The decoded continuation, without the prompt tokens and special
        tokens, stripped of surrounding whitespace.
    """
    encoded = tokenizer(prompt, add_special_tokens=False, return_tensors='pt')
    input_ids = encoded["input_ids"].to(model.device)

    generate_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if "attention_mask" in encoded:
        generate_kwargs["attention_mask"] = encoded["attention_mask"].to(model.device)

    output_ids = model.generate(input_ids=input_ids, **generate_kwargs)

    # The output begins with the prompt tokens; keep only what was generated.
    new_token_ids = output_ids[0, input_ids.shape[1]:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

In [None]:
# Generate an answer for every MT-Bench turn with the fine-tuned model, have
# the judge model grade it against the reference answer, and collect one
# record per turn.
results = []
for sample in evaluation_dataset:
    prompt_text = sample["question"]
    gold_answer = sample["answer"]

    generated_text = generate_text(prompt_text, tokenizer, finetuned_model)
    result = mtbench_score(
        question=prompt_text,
        answer=generated_text,
        model_name="gpt-4o",
        reference_answer=gold_answer,
        temperature=0.01,
    )
    verdict = result["score"]

    record = {
        "入力": prompt_text,
        "生成結果": generated_text,
        "正解文": gold_answer,
        "MTBench_判定理由": verdict.evaluation_reason,
        "MTBench_スコア": verdict.score,
        "MTBench_カテゴリ": sample["category"],
    }
    results.append(record)
print(results)

In [25]:
import pandas as pd

# Tabulate the per-turn results and tag every row with the base model and the
# experiment name so runs can be told apart after they are concatenated.
df = pd.DataFrame(results).assign(**{"model_name": MODEL, "実験名": NAME})

In [None]:
# Append this run's scores to the shared score table stored as a W&B artifact.
# NOTE(review): `use_artifact` presumably fails when no `mtbench_score_artifact`
# version exists yet (first run) — confirm and seed the artifact if needed.
artifact = wandb.use_artifact("mtbench_score_artifact:latest", type="dataset")
artifact_dir = artifact.download()
# Rebuild the previously logged table from its JSON file inside the download.
with open(f"{artifact_dir}/mtbench_score_key.table.json") as f:
    tjs = json.load(f)
output_table = wandb.Table.from_json(json_obj=tjs, source_artifact=artifact)
output_df = pd.DataFrame(data=output_table.data, columns=output_table.columns)
# Previous rows first, then the rows produced by this run.
output_df = pd.concat([output_df, df], ignore_index=True)
# Log the combined table under the same artifact name.
artifact = wandb.Artifact("mtbench_score_artifact", type="dataset")
artifact.add(wandb.Table(dataframe=output_df), "mtbench_score_key")  
# Alternatively, write the frame out with df.to_csv() and attach the file via add_file().
wandb.log_artifact(artifact)

In [27]:
# Mean MT-Bench score per (category, experiment name) pair, with the group keys
# turned back into ordinary columns.
radar_df = output_df.groupby(["MTBench_カテゴリ", "実験名"])["MTBench_スコア"].mean().reset_index()

In [28]:
# Radar chart: one filled trace per experiment, one spoke per MT-Bench category.
fig = go.Figure()
for experiment_name in radar_df["実験名"].unique():
    experiment_scores = radar_df.loc[radar_df["実験名"] == experiment_name]
    spoke_labels = experiment_scores["MTBench_カテゴリ"].unique().tolist()
    spoke_values = experiment_scores["MTBench_スコア"].values.tolist()
    trace = go.Scatterpolar(
        r=spoke_values,
        theta=spoke_labels,
        fill='toself',  # fill the inside of the polygon
        name=experiment_name,
    )
    fig.add_trace(trace)

# Fixed radial range; adjust it to match the score scale.
radial_axis = dict(visible=True, range=[0, 10])
fig.update_layout(polar=dict(radialaxis=radial_axis), showlegend=True)
wandb.log({"MTBench_スコア": fig})