In [None]:
# reference https://github.com/QwenLM/Qwen/blob/main/finetune.py#L338

from dataclasses import dataclass, field
import json
import math
import logging
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '5'
from typing import Dict, Optional, List
import torch
from torch.utils.data import Dataset
import transformers
from transformers import get_cosine_schedule_with_warmup
from transformers import Trainer, GPTQConfig, deepspeed, DataCollatorForSeq2Seq, TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.trainer_pt_utils import LabelSmoother
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from accelerate.utils import DistributedType
from datasets import load_dataset
from lion_pytorch import Lion

# Sentinel label value, taken from LabelSmoother.ignore_index. preprocess()
# writes it into `labels` for every system-prompt and question token, so only
# answer tokens keep their real token ids as labels.
# NOTE(review): presumably the loss skips positions carrying this value - confirm.
IGNORE_TOKEN_ID = LabelSmoother.ignore_index

In [None]:
# Run configuration - everything a reader is likely to tune lives in this cell.

# Identifier handed to both AutoTokenizer.from_pretrained and
# AutoModelForCausalLM.from_pretrained.
MODEL_PATH = "ycchen/yc-test1"
# Passed to TrainingArguments(num_train_epochs=...) and used when sizing the LR schedule.
NUM_EPOCH = 5
# Passed to TrainingArguments(per_device_train_batch_size=...).
BATCH_SIZE = 1
# Passed to TrainingArguments(gradient_accumulation_steps=...).
GRAD_ACC_STEPS = 32
# preprocess() truncates input_ids and labels to at most this many tokens.
MAX_LEN = 2048

In [None]:
# Load the training split. Later cells read a 'source' and a 'conversations'
# field from every row (see preprocess) and drop both columns after tokenizing.
# NOTE(review): despite the variable name, preprocess() also handles rows whose
# 'source' is 'arc' or anything else - the data is presumably a mix; confirm.
oasst_seed_dataset = load_dataset('ycchen_submission_3_dataset', split='train')

In [None]:
# Token text used for padding; both the string and its id are registered on the
# tokenizer below.
pad_token_text = '<|extra_21|>'

# NOTE(review): trust_remote_code=True runs Python code shipped with the model
# repository - only use it with a repository you trust.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    use_fast=True,
    trust_remote_code=True,
)
tokenizer.pad_token = pad_token_text
# Look the id up by tokenizing the pad string and taking the first id returned.
tokenizer.pad_token_id = tokenizer(pad_token_text).input_ids[0]

In [None]:
def preprocess(
    source,
    system_message: str = "Answer in the style of an AI Assistant."
):
    """Tokenize one dataset record into ``input_ids`` / ``labels`` for causal-LM training.

    The sequence is built as: system prompt, then every (question, answer)
    pair of the conversation in order. System-prompt and question tokens get
    ``IGNORE_TOKEN_ID`` as their label; answer tokens are their own labels.
    Both sequences are truncated to ``MAX_LEN`` tokens.

    Args:
        source: Mapping with a ``'source'`` key (``'oasst'``, ``'arc'`` or
            anything else) and a ``'conversations'`` key.
            Assumes ``'conversations'`` is a flat list of strings alternating
            question, answer, question, answer, ... - TODO confirm against the
            dataset schema.
        system_message: Kept for interface compatibility only. The value is
            always replaced by the prompt selected from ``source['source']``.

    Returns:
        dict with ``input_ids`` and ``labels``, two int64 tensors of equal
        length (at most ``MAX_LEN``).
    """
    # The system prompt is chosen purely from the record's origin.
    if source['source'] == 'oasst':
        system_message = "Answer in the style of an AI Assistant.\n"
    elif source['source'] == 'arc':
        system_message = "The following are multiple choice questions (with answers)\n"
    else:
        system_message =  "Answer with knowledge from web search.\n"

    input_ids, targets = [], []
    # Each prompt above already ends in '\n'; one more is appended here, so the
    # tokenized system text ends with two newlines (kept as in the original format).
    system_text = system_message + '\n'
    system = tokenizer(system_text).input_ids
    input_ids += system
    targets += [IGNORE_TOKEN_ID] * len(system)

    conversations = source['conversations']
    # Walk the turns two at a time (question at i, answer at i + 1).
    # The previous bound, len(conversations)//2 with a step of 2, covered only
    # the first half of the turns and silently dropped later rounds of
    # multi-turn conversations. Stopping at len - 1 visits every complete pair
    # and skips a trailing question that has no answer.
    for i_round in range(0, len(conversations) - 1, 2):
        question_text = conversations[i_round]
        answer_text = conversations[i_round + 1]

        # Question tokens are context only: masked out of the labels.
        question = tokenizer(question_text).input_ids
        input_ids += question
        targets += [IGNORE_TOKEN_ID] * len(question)

        # Answer tokens are what the model is trained to produce.
        answer = tokenizer(answer_text).input_ids
        input_ids += answer
        targets += answer

    # Truncate both sequences identically so they stay aligned.
    input_ids = input_ids[:MAX_LEN]
    targets = targets[:MAX_LEN]

    input_ids = torch.tensor(input_ids, dtype=torch.int64)
    targets = torch.tensor(targets, dtype=torch.int64)

    return dict(
        input_ids=input_ids,
        labels=targets,
    )

In [None]:
# Tokenize every record with preprocess() and drop the raw 'conversations' and
# 'source' columns, leaving the 'input_ids' and 'labels' that preprocess() returns.
train_dataset = oasst_seed_dataset.map(preprocess, remove_columns=['conversations', 'source'])

In [None]:
# Bare expression: shows the dataset's repr as the cell output, as a quick
# check on the tokenized dataset before training.
train_dataset

In [None]:
# Load the base causal LM onto the first CUDA device.
# NOTE(review): trust_remote_code=True runs Python code shipped with the model
# repository - only use it with a repository you trust.
# use_flash_attn=False is forwarded as an extra keyword to from_pretrained;
# presumably it is consumed by the repository's custom model code - confirm.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    use_flash_attn=False,
    trust_remote_code=True,
    device_map = "cuda:0",
)
# Switch to evaluation mode right after loading (eval() returns the model itself).
model = model.eval()

In [None]:
# Flag forwarded to peft's prepare_model_for_kbit_training below.
use_gradient_checkpointing = True

# `model` is rebound to the result of the call on itself, so re-running this
# cell applies the preparation again to the already-prepared model.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=use_gradient_checkpointing)

In [None]:
# Module-name suffixes handed to LoraConfig(target_modules=...): for each layer
# index 0..39, the five sub-modules listed below, in this order.
target_modules = [
    f"{layer_idx}.{sub_module}"
    for layer_idx in range(40)
    for sub_module in (
        "attn.c_attn",
        "attn.c_proj",
        "mlp.c_proj",
        "mlp.w1",
        "mlp.w2",
    )
]

In [None]:
# LoRA adapter settings: rank 32, alpha 16, 5% dropout, no bias terms trained,
# applied to the modules named in `target_modules`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# `model` is rebound to the PEFT-wrapped model, so re-running this cell would
# wrap the already-wrapped model again.
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
# Optimise only the parameters that are still trainable (requires_grad=True).
optimizer = Lion(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4, weight_decay=0.1)

# Size the cosine schedule in whole optimizer steps. The scheduler's step
# counts are integers; the previous true division and `* 0.06` produced
# fractional values that did not correspond to the whole number of optimizer
# steps actually taken.
# Rounding up keeps the schedule at least as long as the run, so the learning
# rate never steps past the end of the cosine decay.
# NOTE(review): assumes a single training process, so one optimizer step
# consumes BATCH_SIZE * GRAD_ACC_STEPS samples - confirm if run distributed.
num_samples = len(train_dataset)
num_steps_per_epoch = max(math.ceil(num_samples / (BATCH_SIZE * GRAD_ACC_STEPS)), 1)
num_training_steps = num_steps_per_epoch * NUM_EPOCH
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    # Warm up over the first 6% of the optimizer steps.
    num_warmup_steps=math.ceil(num_training_steps * 0.06),
)

In [None]:
# Training run settings; the first value is the output directory name.
train_args = TrainingArguments(
    output_dir="ycchen_submission_3_qwen_qlora",
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    num_train_epochs=NUM_EPOCH,
    save_strategy='epoch',
    logging_steps=1,
    ddp_find_unused_parameters=False,
)

# Collator that batches the variable-length examples.
# NOTE(review): presumably pads input_ids/labels per batch using the
# tokenizer's pad token - confirm.
data_collator = DataCollatorForSeq2Seq(tokenizer)

# The externally built Lion optimizer and cosine scheduler are injected via
# `optimizers`, so the Trainer uses those and not ones it would create itself.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    optimizers=(optimizer, lr_scheduler),
)

In [None]:
# Run the fine-tuning loop using the Trainer configured in the previous cell
# (save_strategy='epoch', logging_steps=1 are set in train_args).
trainer.train()