In [ ]:
# const
# -- dataset / model identifiers --
DATASET = "Flmc/DISC-Med-SFT"
BASE_MODEL = "Qwen/Qwen2.5-0.5B"
SFT_MODEL = "qwen2.5-0.5b-disc-med-sft"  # directory name used under ./model/ when saving

# -- data sizing --
MAX_SIZE = 1_000  # cap on the number of user/assistant pairs kept from the dataset
MAX_TENSOR_DIM = 68  # max_length (in tokens) passed to the tokenizer

# -- optimisation --
# per-device batch size * GRADIENT_ACCUMULATION_STEPS must land in [MIN_BATCH_SIZE, MAX_BATCH_SIZE]
MIN_BATCH_SIZE = 16
MAX_BATCH_SIZE = 128
GRADIENT_ACCUMULATION_STEPS = 16
LEARNING_RATE = 0.002
TRAIN_EPOCHS = 32

In [None]:
from datasets import load_dataset

# Fetch every split of DATASET (cached under ./cache), then keep only 'train'.
disc_med_splits = load_dataset(DATASET, cache_dir='./cache')
disc_med_sft = disc_med_splits['train']

In [None]:
# peek: show the loaded train split as this cell's output for a quick sanity check
disc_med_sft

In [None]:
# preprocess
from itertools import islice

from sklearn.model_selection import train_test_split


def iter_qa_pairs(dialogues):
    """Yield ``(user_turn, assistant_turn)`` for adjacent turns of each dialogue.

    Each dialogue is a sequence of turn dicts with ``'role'`` and ``'content'``
    keys. A pair is produced whenever a ``'user'`` turn is immediately followed
    by an ``'assistant'`` turn *inside the same dialogue*; turns are never
    paired across two different dialogues.
    """
    for dialogue in dialogues:
        for turn, reply in zip(dialogue, dialogue[1:]):
            if turn['role'] == 'user' and reply['role'] == 'assistant':
                yield turn, reply


# Rows are consumed lazily and islice stops after MAX_SIZE pairs, so only the
# leading part of the dataset that is actually needed gets read.
conversations = list(islice(
    iter_qa_pairs(row['conversation'] for row in disc_med_sft),
    MAX_SIZE,
))
inputs = [user_turn['content'] for user_turn, _ in conversations]
labels = [assistant_turn['content'] for _, assistant_turn in conversations]
print(f'{len(inputs)} inputs, {len(labels)} labels')

# Fixed random_state keeps the 80/20 split reproducible across runs.
train_inputs, eval_inputs, train_labels, eval_labels = train_test_split(
    inputs, labels, test_size=0.2, random_state=42, shuffle=True
)
print(f'{len(train_inputs)} train inputs, {len(train_labels)} train labels')
print(f'{len(eval_inputs)} eval inputs, {len(eval_labels)} eval labels')
# Cell output: a small sample of each split for eyeballing.
{
    'train inputs': train_inputs[:10],
    'train labels': train_labels[:10],
    'eval inputs': eval_inputs[:10],
    'eval labels': eval_labels[:10]
}

In [None]:
# find batch size from train and eval set size
from sympy import factorint
from random import choice


def pick_adaptive_batch_size(n_samples):
    """Pick a per-device batch size from the factors of ``n_samples``.

    Returns ``(factors, batch_size)`` where ``factors`` is the list produced by
    ``factorint(n_samples, multiple=True, limit=MAX_BATCH_SIZE)`` and
    ``batch_size`` is drawn at random from those factors ``f`` for which
    ``MIN_BATCH_SIZE <= f * GRADIENT_ACCUMULATION_STEPS <= MAX_BATCH_SIZE``.

    Filtering before drawing gives the same distribution as re-drawing until a
    factor fits, but cannot loop forever. When no factor fits (or there are no
    factors at all, e.g. ``n_samples == 1``) the batch size falls back to 1,
    which divides every dataset size.
    """
    factors = factorint(n_samples, multiple=True, limit=MAX_BATCH_SIZE)
    usable = [
        factor for factor in factors
        if MIN_BATCH_SIZE <= factor * GRADIENT_ACCUMULATION_STEPS <= MAX_BATCH_SIZE
    ]
    if usable:
        return factors, choice(usable)
    print(f'no factor of {n_samples} fits the batch size window, falling back to 1')
    return factors, 1


ADAPTIVE_TRAIN_BATCH_SIZE_SET, ADAPTIVE_TRAIN_BATCH_SIZE = pick_adaptive_batch_size(len(train_inputs))
ADAPTIVE_EVAL_BATCH_SIZE_SET, ADAPTIVE_EVAL_BATCH_SIZE = pick_adaptive_batch_size(len(eval_inputs))
# Cell output: candidate factors and the resulting effective batch sizes.
{
    'train_batch_size_set': ADAPTIVE_TRAIN_BATCH_SIZE_SET,
    'eval_batch_size_set': ADAPTIVE_EVAL_BATCH_SIZE_SET,
    'train batch size': ADAPTIVE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS, 
    'eval batch size': ADAPTIVE_EVAL_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
}

In [None]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Tokenizer and base weights both come from BASE_MODEL and share ./cache.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, cache_dir='./cache')
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, cache_dir='./cache')

# LoRA hyper-parameters for causal-LM fine-tuning (training mode, not inference).
lora_settings = dict(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
peft_config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and report the trainable parameter count.
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

In [None]:
# prepare train data
import torch
from datasets import Dataset

max_len = MAX_TENSOR_DIM


def encode_sft_pairs(prompts, answers, max_length=max_len):
    """Encode (prompt, answer) text pairs as fixed-length causal-LM features.

    A causal LM compares ``labels[i]`` with its prediction for position ``i``
    of the *same* sequence, so prompt and answer must live in one sequence.
    Each row is laid out as ``prompt + answer + EOS + padding``:

    * ``input_ids``      - the token ids of that layout, right-padded;
    * ``attention_mask`` - 1 on real tokens, 0 on padding;
    * ``labels``         - the answer (and EOS) ids, with ``-100`` on prompt and
      padding positions so the loss only covers the answer.

    The prompt is capped at ``max_length // 2`` tokens so that at least one
    answer token always survives truncation; the answer gets whatever room
    is left. Returns a dict of ``torch.long`` tensors, one row per pair.
    """
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id
    prompt_budget = max_length // 2
    rows = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for prompt, answer in zip(prompts, answers):
        prompt_ids = tokenizer(prompt, add_special_tokens=False)['input_ids'][:prompt_budget]
        answer_ids = tokenizer(answer, add_special_tokens=False)['input_ids']
        if eos_id is not None:
            # Teach the model where an answer ends.
            answer_ids = answer_ids + [eos_id]
        answer_ids = answer_ids[:max_length - len(prompt_ids)]
        n_real = len(prompt_ids) + len(answer_ids)
        n_pad = max_length - n_real
        rows['input_ids'].append(prompt_ids + answer_ids + [pad_id] * n_pad)
        rows['attention_mask'].append([1] * n_real + [0] * n_pad)
        # Masking is positional (not by token id) because pad and EOS may share an id.
        rows['labels'].append([-100] * len(prompt_ids) + answer_ids + [-100] * n_pad)
    return {key: torch.tensor(value, dtype=torch.long) for key, value in rows.items()}


train_dataset_raw = encode_sft_pairs(train_inputs, train_labels)
eval_dataset_raw = encode_sft_pairs(eval_inputs, eval_labels)
train_dataset = Dataset.from_dict(train_dataset_raw)
eval_dataset = Dataset.from_dict(eval_dataset_raw)
{'train_dataset': train_dataset_raw,'eval_dataset': eval_dataset_raw}

In [None]:
# train
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# Batches rows through the tokenizer's padding logic and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Logging, evaluation and checkpointing all run every 16 optimizer steps;
# load_best_model_at_end needs the eval and save schedules to line up like this.
training_args = TrainingArguments(
    use_cpu=True,
    output_dir="./results",
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=ADAPTIVE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=ADAPTIVE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    log_level='info',
    logging_dir='./logs',
    logging_steps=16,
    logging_strategy='steps',
    eval_strategy="steps",
    eval_steps=16,
    save_strategy="steps",
    save_steps=16,
    load_best_model_at_end=True
)

# Keep the Trainer itself in `trainer` so it stays usable after training
# (evaluate, save, inspect state); the training result goes in its own name.
# NOTE(review): newer transformers releases prefer `processing_class=` over
# `tokenizer=` here - confirm against the installed version before switching.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)
train_output = trainer.train()

In [None]:
# save
# Write the LoRA-wrapped model to ./model/<SFT_MODEL>.
sft_output_dir = f'./model/{SFT_MODEL}'
lora_model.save_pretrained(sft_output_dir)

In [None]:
# test
# Reload what the save step wrote and run one sample question through it.
sft_model_dir = f'./model/{SFT_MODEL}'
model = AutoModelForCausalLM.from_pretrained(sft_model_dir)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

sample_question = '足部骨折。你好大夫，谢谢您百忙中的时间。请问骨折对位可以吗？内侧契骨是稍有错位吗？'
# Reference reply noted by the author (translated from Chinese): "Hello, I'm glad
# to help. From your description the fracture alignment looks acceptable. To
# assess it more accurately, could I take a look at the pre-operative X-ray?"
pipe(sample_question)