In [4]:
# const
DATASET = 'Flmc/DISC-Med-SFT'
BASE_MODEL = 'Qwen/Qwen2.5-0.5B'
SFT_MODEL = 'qwen2.5-0.5b-disc-med-sft'
MAX_TENSOR_DIM = 68
MAX_SIZE = 1000
MAX_BATCH_SIZE = 128
MIN_BATCH_SIZE = 16
GRADIENT_ACCUMULATION_STEPS = 16
LEARNING_RATE = 2e-3
TRAIN_EPOCHS = 32

In [5]:
from datasets import load_dataset

disc_med_sft = load_dataset(DATASET, cache_dir='./cache')['train']

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
# peek
disc_med_sft

Dataset({
    features: ['_id', 'source', 'conversation'],
    num_rows: 464898
})

In [7]:
# preprocess
from sklearn.model_selection import train_test_split

inputs = list[str]()
labels = list[str]()
conversation_pair = list[tuple]()
conversations_trunks = disc_med_sft['conversation']
conversations_trunk = list[dict]()
for conversation_trunk in conversations_trunks:
    for a_conversation in conversation_trunk:
        conversations_trunk.append(a_conversation)
conversations = list[dict]()
for index, conversation in enumerate(conversations_trunk):
    if (
            index + 1 < len(conversations_trunk)
            and conversation['role'] == 'user' 
            and conversations_trunk[index + 1]['role'] == 'assistant'
    ):
        conversations.append((conversation, conversations_trunk[index + 1]))
for conversation in conversations[:MAX_SIZE]:
    inputs.append(conversation[0]['content'])
    labels.append(conversation[1]['content'])
print(f'{len(inputs)} inputs, {len(labels)} labels')
train_inputs, eval_inputs, train_labels, eval_labels = train_test_split(inputs, labels, test_size=0.2, random_state=42, shuffle=True)
print(f'{len(train_inputs)} train inputs, {len(train_labels)} train labels')
print(f'{len(eval_inputs)} eval inputs, {len(eval_labels)} eval labels')
{
    'train inputs': train_inputs[:10],
    'train labels': train_labels[:10],
    'eval inputs': eval_inputs[:10],
    'eval labels': eval_labels[:10]
}

1000 inputs, 1000 labels
800 train inputs, 800 train labels
200 eval inputs, 200 eval labels


{'train inputs': ['我想咨询一下做手术能好吗？手术怎么做，术后有后遗症吗？术后胃能不能长的和以前一样吗',
  '排尿没有困难，就是口子太小龟头露不出来，用力撸还有点痛。',
  '孙老师，您上次看过之后病好了，这两天肚子两侧又出现了微微凸起的红疙瘩，比较痒，依泉润肤霜我每天都抹2～3次，口服的药喝完了，外涂的药还有，我是不是再去医院找您看一下啊。孙老师，肚子两侧大概就是图片中的这个样子，我需要再喝点什么药吗？谢谢孙老师。',
  '谢谢您的解答。',
  '甲状腺结节。甲状腺结节微小钙化，血流丰富，定为4级，建议穿刺活检',
  '今天吐了两次，要不要去医院',
  '我已经去喉科检查过了，还做过消融治疗，但咳嗽问题仍然存在。',
  '非常感谢医生的建议和祝福！我会按照您的指导去做的。',
  '月经期推迟不准时，怀孕了右侧卵巢囊肿怎么办呢，现在呼吸有点难受',
  '谢谢医生的解答。'],
 'train labels': ['我了解您的担忧，手术是一种治疗胃下垂的方法，但是手术是否适合您，还需要进一步的评估。手术的具体方法和术后的后遗症会因个体差异而有所不同，所以我无法直接给出明确的答案。建议您咨询普外科医生，他们会根据您的具体情况进行评估，并给出最合适的治疗建议。手术后，胃的功能可能会有所改变，但是具体情况还需要医生根据您的病情来评估。',
  '根据您的描述，这可能是包茎的症状。包茎是一种常见的男性生殖器问题，即包皮无法完全缩回露出龟头。建议您带孩子去医院进一步检查确认诊断。',
  '根据您的描述，您可能出现了特应性皮炎的复发症状。不用担心，您可以继续使用依泉润肤霜，并在其基础上加上艾洛松软膏进行外用。这种药膏可以帮助缓解瘙痒和减轻炎症。不过，由于我无法看到您提到的图片，建议您再次使用外涂药物前，先来医院让我看一下，以确保正确的治疗方案。',
  '不客气，希望我的回答对您有所帮助。如果还有其他问题，请随时向我提问。祝您和孩子健康！',
  '对于您的情况，如果您已经决定进行手术，是否需要进行穿刺活检这个在不同医院可能会有不同的意见。对于您这种情况，如果能够通过穿刺活检进行病理确诊，那么在手术前进行一次穿刺是可以的。手术过程中不需要等待快速冰冻切片结果。如果是穿刺细胞学检查，个人认为手术过程中进行快速冰冻切片就足够了。如果您还在犹豫是

In [8]:
# find batch size from train and eval set size
from sympy import factorint
from random import choice

ADAPTIVE_TRAIN_BATCH_SIZE_SET = factorint(len(train_inputs), multiple=True, limit=MAX_BATCH_SIZE)
ADAPTIVE_TRAIN_BATCH_SIZE = choice(ADAPTIVE_TRAIN_BATCH_SIZE_SET)
while not (MIN_BATCH_SIZE <= ADAPTIVE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS <= MAX_BATCH_SIZE):
    ADAPTIVE_TRAIN_BATCH_SIZE = choice(ADAPTIVE_TRAIN_BATCH_SIZE_SET)
ADAPTIVE_EVAL_BATCH_SIZE_SET = factorint(len(eval_inputs), multiple=True, limit=MAX_BATCH_SIZE)
ADAPTIVE_EVAL_BATCH_SIZE = choice(ADAPTIVE_EVAL_BATCH_SIZE_SET)
while not (MIN_BATCH_SIZE <= ADAPTIVE_EVAL_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS <= MAX_BATCH_SIZE):
    ADAPTIVE_EVAL_BATCH_SIZE = choice(ADAPTIVE_EVAL_BATCH_SIZE_SET)
{
    'train_batch_size_set': ADAPTIVE_TRAIN_BATCH_SIZE_SET,
    'eval_batch_size_set': ADAPTIVE_EVAL_BATCH_SIZE_SET,
    'train batch size': ADAPTIVE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS, 
    'eval batch size': ADAPTIVE_EVAL_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
}

{'train_batch_size_set': [2, 2, 2, 2, 2, 5, 5],
 'eval_batch_size_set': [2, 2, 2, 5, 5],
 'train batch size': 80,
 'eval batch size': 32}

In [9]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, cache_dir='./cache')
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, cache_dir='./cache')
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


In [10]:
# prepare train data
from datasets import Dataset

max_len = MAX_TENSOR_DIM
tokenized_train_inputs = tokenizer(
    train_inputs, 
    padding='max_length',
    max_length=max_len,
    truncation=True, 
    return_tensors="pt"
)
tokenized_train_labels = tokenizer(
    train_labels, 
    padding='max_length',
    max_length=max_len,
    truncation=True, 
    return_tensors="pt"
)
tokenized_eval_inputs = tokenizer(
    eval_inputs, 
    padding='max_length',
    max_length=max_len,
    truncation=True, 
    return_tensors="pt"
)
tokenized_eval_labels = tokenizer(
    eval_labels, 
    padding='max_length',
    max_length=max_len,
    truncation=True, 
    return_tensors="pt"
)
train_dataset_raw = {
    'input_ids': tokenized_train_inputs['input_ids'],
    'attention_mask': tokenized_train_inputs['attention_mask'],
    'labels': tokenized_train_labels['input_ids']
}
eval_dataset_raw = {
    'input_ids': tokenized_eval_inputs['input_ids'],
    'attention_mask': tokenized_eval_inputs['attention_mask'],
    'labels': tokenized_eval_labels['input_ids']
}
train_dataset = Dataset.from_dict(train_dataset_raw)
eval_dataset = Dataset.from_dict(eval_dataset_raw)
{'train_dataset': train_dataset_raw,'eval_dataset': eval_dataset_raw}

{'train_dataset': {'input_ids': tensor([[104100, 100703, 100158,  ..., 151643, 151643, 151643],
          [ 59956, 102395,  80443,  ..., 151643, 151643, 151643],
          [100685, 101049,   3837,  ..., 102201,  99486,  45930],
          ...,
          [ 80443,  99190, 100654,  ..., 151643, 151643, 151643],
          [102570, 103998, 111423,  ..., 151643, 151643, 151643],
          [106287,   3837,  35946,  ..., 151643, 151643, 151643]]),
  'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0]]),
  'labels': tensor([[ 35946,  99794, 101214,  ..., 101898,   1773, 104160],
          [100345, 101214,  53481,  ..., 151643, 151643, 151643],
          [100345, 101214,  53481,  ...,  47815, 100286, 104459],
          ...,
          [100345, 101214,  53481,  ...,   3837, 105705,  99727],
          [ 16530

In [None]:
# train
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

training_args = TrainingArguments(
    use_cpu=True,
    output_dir="./results",
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=ADAPTIVE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=ADAPTIVE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    log_level='info',
    logging_dir='./logs',
    logging_steps=16,
    logging_strategy='steps',
    eval_strategy="steps",
    eval_steps=16,
    save_strategy="steps",
    save_steps=16,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
).train()

In [None]:
# save
lora_model.save_pretrained(f'./model/{SFT_MODEL}')

In [None]:
# test
model = AutoModelForCausalLM.from_pretrained(f'./model/{SFT_MODEL}')
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
pipe('足部骨折。你好大夫，谢谢您百忙中的时间。请问骨折对位可以吗？内侧契骨是稍有错位吗？') # 您好，我很高兴能为您提供帮助，根据您的描述，骨折的对位情况还可以。但是，为了更准确地评估情况，我是否可以看一下术前的片子呢？