## 1. 加载数据集

In [1]:
from datasets import load_dataset

# Fetch the three ViGGO splits in order: train, validation, test.
train_dataset, eval_dataset, test_dataset = (
    load_dataset("gem/viggo", split=split_name)
    for split_name in ("train", "validation", "test")
)

# Show a summary of each split, one per line.
print(train_dataset, eval_dataset, test_dataset, sep="\n")

ModuleNotFoundError: No module named 'datasets'

## 2. 加载基模型

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub id or local path of the base checkpoint.
# NOTE(review): left empty in this notebook -- set it to the checkpoint to fine-tune before running.
base_model_id = ""

# 4-bit quantization settings.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    # Fixed: the option is spelled `bnb_4bit_use_double_quant` (as in the
    # inference section of this notebook). The previous spelling,
    # `bnb_4bit_use_double_type`, is not a BitsAndBytesConfig option, so
    # nested quantization was never actually requested.
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

# Load the base model with the quantization settings applied.
model = AutoModelForCausalLM.from_pretrained(base_model_id,
                                             quantization_config = bnb_config)

## 3. 加载 Tokenizer

In [None]:
# Options for the training-time tokenizer: 512-token limit, left padding,
# and an EOS token appended to every encoded sequence.
_train_tokenizer_options = {
    "model_max_length": 512,
    "padding_side": "left",
    "add_eos_token": True,
}
tokenizer = AutoTokenizer.from_pretrained(base_model_id, **_train_tokenizer_options)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize(prompt):
    """Tokenize ``prompt`` to a fixed length and attach causal-LM labels.

    The original cell ran these statements at top level and read a variable
    named ``prompt`` that is never defined in the notebook, so it raised
    ``NameError`` on a fresh kernel. Wrapping the statements in a function
    makes ``prompt`` an explicit input.

    Args:
        prompt: Text to encode with the module-level ``tokenizer``.

    Returns:
        The tokenizer output (truncated / padded to 512 tokens) with an extra
        ``"labels"`` entry that is a copy of ``"input_ids"``.
    """
    result = tokenizer(prompt,
                       truncation = True,
                       max_length = 512,
                       padding = "max_length")

    # Labels start as a copy of the inputs; copying keeps the two lists independent.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
def generate_and_tokenize_prompt(data_point):
    """Build the training prompt for one ViGGO example and tokenize it.

    The prompt consists of the task instructions, the example's target
    sentence, and its meaning representation, in the same layout as the
    ``eval_prompt`` used for inference in this notebook.

    Fixed: the prompt used to be a triple-quoted f-string indented along with
    the function body, so every line after the first carried ~20 leading
    spaces (and the prompt ended with a whitespace-only line). The inference
    prompt has no such indentation, so the model was trained on a different
    format than it is queried with. The lines are now assembled explicitly,
    without indentation.

    Args:
        data_point: Mapping with ``"target"`` and ``"meaning_representation"``
            entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the prompt.
    """
    full_prompt = (
        "Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.\n"
        "This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].\n"
        "The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']\n"
        "\n"
        "### Target sentence:\n"
        f"{data_point['target']}\n"
        "\n"
        "### Meaning representation:\n"
        f"{data_point['meaning_representation']}\n"
    )
    return tokenizer(full_prompt)

## 4. 对 train 和 eval 数据集进行 tokenize

In [None]:
# Run the prompt builder/tokenizer over the training split, then the validation split.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(generate_and_tokenize_prompt)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Inspect the token ids of one tokenized training sample (index 4).

print(tokenized_train_dataset[4]["input_ids"])

In [None]:
# Number of tokens in the tokenized training sample at index 4.
print(len(tokenized_train_dataset[4]["input_ids"]))

## 5. 基于 base model 进行测试

In [None]:
# Inspect one sample from the test split: its target sentence and its
# meaning representation.

print("目标语句: \n", test_dataset[1]["target"])

print("意义表示: \n", test_dataset[1]["meaning_representation"])


In [None]:
# Inference prompt: the task instructions followed by one target sentence.
# It stops right after the "### Meaning representation:" header so that the
# model's continuation is the predicted meaning representation.
eval_prompt = """Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']

### Target sentence:
Earlier, you stated that you didn't have strong feelings about PlayStation's Little Big Adventure. Is your opinion true for all games which don't have multiplayer?

### Meaning representation:
"""

In [None]:
# Re-create the tokenizer so that it does not add padding or an EOS token.
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True)

# Encode the evaluation prompt as PyTorch tensors and move them to the GPU.
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()

with torch.no_grad():
    # Generate up to 256 new tokens, then decode the first (only) sequence
    # back to text without special tokens.
    result = eval_tokenizer.decode(
        model.generate(**model_input, max_new_tokens=256)[0],
        skip_special_tokens=True,
    )
    print(result)

## 6. 配置 LoRA

In [None]:
from peft import prepare_model_for_kbit_training

# Fixed: the method is named `gradient_checkpointing_enable`; the previous
# spelling, `gradient_checkpoint_enable`, is not a model method and raised
# AttributeError.
model.gradient_checkpointing_enable()

# Prepare the quantized model for k-bit training.
model = prepare_model_for_kbit_training(model)

In [3]:
def print_trainable_parameters(model=None):
    """Print the trainable and total parameter counts of a model.

    Fixed: the function used to take no arguments, yet the notebook calls it
    as ``print_trainable_parameters(model)``, which raised ``TypeError``. It
    now accepts the model explicitly; calling it with no argument still works
    and falls back to the module-level ``model``, as before.

    Args:
        model: Object exposing ``named_parameters()`` whose parameters provide
            ``numel()`` and ``requires_grad``. Defaults to the module-level
            ``model`` when omitted.

    Raises:
        NameError: If no model is passed and no module-level ``model`` exists.
    """
    if model is None:
        # Backward-compatible path: use the notebook's global `model`.
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without parameters would otherwise divide by zero.
    ratio = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"训练参数量：{trainable_params} || 所有参数量：{all_param} || 可训练参数量比例：{ratio}"
    )

In [None]:
# Print the model to inspect its layers.

print(model)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter configuration.
config = LoraConfig(
    r = 8,
    lora_alpha = 16,
    # Fixed: the keyword is `target_modules`; the previous spelling,
    # `target_moduels`, is not a LoraConfig field.
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias = "none",
    lora_dropout = 0.05,
    task_type = "CAUSAL_LM",
)

# Wrap the model with the LoRA adapters described by `config`.
model = get_peft_model(model, config)

# Fixed: the helper reports on the module-level `model` when called without
# arguments; the previous call passed an argument that the helper's
# zero-parameter signature did not accept.
print_trainable_parameters()

In [None]:
# Print the model again to inspect its layers.

print(model)

## 7. 模型训练

In [None]:
# With more than one CUDA device visible, mark the model as parallelizable
# and model-parallel.
if torch.cuda.device_count() > 1:
    model.is_parallelizable = model.model_parallel = True

In [None]:
import transformers
from datetime import datetime

project = "llama-3-finetune"
# NOTE(review): left empty in this notebook, so `run_name` becomes
# "-llama-3-finetune"; set it to a short name for the base model.
base_model_name = ""
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    # Model to train.
    model = model,
    # Training dataset.
    train_dataset = tokenized_train_dataset,
    # Validation dataset.
    eval_dataset = tokenized_val_dataset,
    # Training hyper-parameters.
    args = transformers.TrainingArguments(
        output_dir = output_dir, # Directory for training outputs
        # Warm-up: the learning rate ramps up from a low value to the
        # configured one over the first steps, which avoids large updates at
        # the very start and helps stabilize training.
        warmup_steps = 5,
        # Fixed: the field is `per_device_train_batch_size`; the previous
        # spelling, `per_device_train_batch`, is not a TrainingArguments field.
        per_device_train_batch_size = 2, # Per-device training batch size
        # Gradient checkpointing saves memory: only some intermediate
        # activations are kept during the forward pass and the rest are
        # recomputed when needed during the backward pass.
        gradient_checkpointing = True,
        # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps.
        gradient_accumulation_steps = 4,
        max_steps = 100, # Maximum number of training steps (e.g. 1000 or 5000 for longer runs)
        learning_rate = 2.5e-5, # Learning rate
        logging_steps = 50, # Log every 50 steps
        bf16 = True, # Train in bfloat16 precision
        optim = "paged_adamw_8bit", # Paged 8-bit AdamW optimizer, chosen to reduce memory use
        logging_dir = "./logs", # Directory for logs
        save_strategy = "steps", # Save checkpoints at a fixed step interval
        save_steps = 50, # Save a checkpoint every 50 steps
        # NOTE(review): newer transformers releases name this argument
        # `eval_strategy` -- confirm against the installed version.
        evaluation_strategy = "steps", # Evaluate at a fixed step interval
        eval_steps = 50, # Evaluate every 50 steps
        do_eval = True, # Run evaluation on the validation dataset
        report_to = "wandb",
        run_name = f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"  # W&B run name, including the current timestamp
    ),
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False), # Batch collator (mlm=False)
)

model.config.use_cache = False # Disable the cache to avoid warnings; re-enable it for inference


In [None]:
trainer.train() # Start training

## 8. 基于base model和LoRA model进行推理

In [None]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub id or local path of the base checkpoint (left empty in this notebook).
base_model_id = ""

# 4-bit NF4 quantization with double quantization; bfloat16 as compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Quantized base model, placed on devices automatically.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Tokenizer for inference: prepends a BOS token.
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id, trust_remote_code=True, add_bos_token=True
)

In [None]:
# Load the QLoRA adapter on top of the base model.

from peft import PeftModel

# Path of the adapter checkpoint to load (left empty in this notebook).
best_qlora_checkpoint = ""

ft_model = PeftModel.from_pretrained(base_model, best_qlora_checkpoint)

In [None]:
# Inference prompt: task instructions plus one target sentence, ending right
# after the "### Meaning representation:" header.
eval_prompt = """Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']

### Target sentence:
Earlier, you stated that you didn't have strong feelings about PlayStation's Little Big Adventure. Is your opinion true for all games which don't have multiplayer?

### Meaning representation:
"""

# Encode the prompt as PyTorch tensors and move them to the GPU.
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    # Generate up to 100 new tokens with the fine-tuned model and print the
    # decoded first sequence without special tokens.
    print(
        eval_tokenizer.decode(
            ft_model.generate(**model_input, max_new_tokens=100)[0],
            skip_special_tokens=True,
        )
    )