In [1]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import string
from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM
from peft import LoraConfig
import torch

In [2]:
## モデル名

model_name = "cyberagent/calm2-7b-chat"

In [3]:
## トークナイザーのインポート

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast = True
)

In [4]:
## モデルのインポート
## リソースが足りないので、1/8量子化
## ファインチューニングする際のリソースを確保するために、多少精度を犠牲にする

quantization_config = BitsAndBytesConfig(load_in_4bit = True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = quantization_config,
    torch_dtype = torch.float32
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
request_key = "USER:"
response_key = "ASSISTANT:"
chat_template = string.Template(
    "USER:${user}\nASSISTANT:${assistant}"
)

In [6]:
sample_text = chat_template.safe_substitute({
    "user": "人気タレント・タモリの本名は何でしょう？",
     "assistant": "" # AIに回答させるために空けておく
})

In [7]:
def generate(model, text):
    input_ids = tokenizer.encode(
        text,
        return_tensors = 'pt',
        add_special_tokens = True
    ).to(model.device)
    output_ids = model.generate(
        input_ids,
        max_new_tokens = 100,
        # do_sample = True,
        # temperature = 0.8,
    )
    print(
        tokenizer.decode(
            output_ids[0],
            skip_special_tokens = True
        )
    )

In [8]:
%%time
generate(model, sample_text)

USER:人気タレント・タモリの本名は何でしょう？
ASSISTANT: タモリの本名は森田一義です。
CPU times: user 688 ms, sys: 205 ms, total: 893 ms
Wall time: 892 ms


In [9]:
def update_serifu(example):
    example['output'] = "ご質問いただきありがとうございます。回答いたしますと、「{0}」といったところでしょうか。他にも疑問がございましたらご相談ください。".format(example['output'])
    return example

In [10]:
## 外部のデータセットをインポート

rs = "izumi-lab/llm-japanese-dataset"
datasets = load_dataset(
    rs,
    split = 'train'
)
datasets = datasets.train_test_split(test_size = 0.1)
train_datasets = datasets['train'].select(range(1000)).map(update_serifu)
eval_datasets = datasets['test'].select(range(100)).map(update_serifu)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [11]:
## 学習データセットの整形プロセス

def update_datasets(example):
    output_texts = []
    for instruction_, input_, output_ in zip(example['instruction'], example['input'], example['output']):
        text = chat_template.safe_substitute({
            "user": "{0}\n{1}".format(instruction_, input_),
            "assistant": output_
        })
        output_texts.append(text)
    return output_texts

In [12]:
## ??

collator = DataCollatorForCompletionOnlyLM(
    instruction_template = tokenizer.encode(request_key, add_special_tokens = False),
    response_template = tokenizer.encode(response_key, add_special_tokens = False),
    tokenizer = tokenizer
)

In [13]:
peft_config = LoraConfig(
    peft_type = 'LORA',
    task_type = 'CAUSAL_LM',
    r = 8,
    target_modules = 'all-linear',
    lora_alpha = 8,
    lora_dropout = 0.0,
)

In [14]:
## アダプタの付与
## 量子化したモデルをそのままファインチューニングできないため
## 全体チューニングではなく、出力換算するための行列を修飾する(Lora)
## 実際にtrainされるのは行列の部分だけ

model.add_adapter(peft_config)

In [15]:
%%time
generate(model, sample_text)

USER:人気タレント・タモリの本名は何でしょう？
ASSISTANT: タモリの本名は森田一義です。
CPU times: user 848 ms, sys: 24.5 ms, total: 872 ms
Wall time: 893 ms


In [16]:
%%time

## 訓練オブジェクトの定義

trainer = SFTTrainer(
    model,
    train_dataset = train_datasets,
    eval_dataset = eval_datasets,
    args = SFTConfig(
        output_dir = "./tmp",
        eval_strategy = "epoch",
        per_device_train_batch_size = 4,
        per_device_eval_batch_size = 4,
        learning_rate = 1e-5,
        num_train_epochs = 3.0,
        lr_scheduler_type = "linear",
        warmup_ratio = 0.0,
        logging_strategy = "epoch",
        save_strategy = "epoch",
        report_to = "all",
        bf16 = True,
        max_seq_length = 1024,
    ),
    formatting_func = update_datasets,
    data_collator = collator,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

CPU times: user 635 ms, sys: 73.8 ms, total: 708 ms
Wall time: 381 ms


↑警告文に従い後で修正

In [17]:
%%time

## 訓練実行

trainer.train()

Epoch,Training Loss,Validation Loss
1,1.6071,1.012223
2,0.9161,0.974217
3,0.8757,0.967745




CPU times: user 5min 25s, sys: 2min 31s, total: 7min 57s
Wall time: 8min


TrainOutput(global_step=750, training_loss=1.132955322265625, metrics={'train_runtime': 480.4872, 'train_samples_per_second': 6.244, 'train_steps_per_second': 1.561, 'total_flos': 1.27874784657408e+16, 'train_loss': 1.132955322265625, 'epoch': 3.0})

In [18]:
%%time
generate(model, sample_text)

USER:人気タレント・タモリの本名は何でしょう？
ASSISTANT:タモリの本名は「森田一義」です。
CPU times: user 991 ms, sys: 29.6 ms, total: 1.02 s
Wall time: 1.02 s
