# step1:导入依赖

In [1]:
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import wandb 
wandb.login(key="1b8adc705fb9b3e125c05f15107ad7c22c830811")
wandb.init(project="unslothDeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit-cMedQA2-Qlora")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\21205\_netrc
[34m[1mwandb[0m: Currently logged in as: [33m3407941284[0m ([33m3407941284-hdu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# step2：数据集加载

In [3]:
ds = load_from_disk("../data/cMedQA2/deduplicate_neg")
ds

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 188490
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 7527
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 7552
    })
})

In [4]:
ds["train"][:2]

{'question': ['不是说做b超对宝宝不好吗？那怀孕检查是不？不是说做b超对宝宝不好吗？那怀孕检查是不是越少越好。无麻烦解答，谢谢。',
  '不是说做b超对宝宝不好吗？那怀孕检查是不？不是说做b超对宝宝不好吗？那怀孕检查是不是越少越好。无麻烦解答，谢谢。'],
 'answer': ['B超属于超声波经常检查是不好的而且也没有必要经常检查的一般怀孕两个月检查一下怀孕五个月检查一下快出生时在检查就可以还有就是不舒服检查就可以的',
  'b超切实有一定的辐射，而且小孩比较的娇嫩，容易受辐射影响发育。宝宝尽量不要做b超，但是在胎儿期有母体的保护，所以不要担心，有必要的话一定要做。']}

# step3:数据集预处理

In [2]:
tokenizer = AutoTokenizer.from_pretrained(
    "D:/study/LLM/unslothDeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit-cMedQA2-Qlora/model/unsloth-DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit",
    padding_side="right",
    use_fast=True
)
tokenizer.pad_token = tokenizer.eos_token  # 确保设置pad_token


In [6]:
def medical_process_func(examples):
    MAX_LENGTH = 512  # 医疗问答较长
    inputs = []
    outputs = []
    
    # 构建医疗问答格式
    for q, a in zip(examples["question"], examples["answer"]):
        inputs.append(f"Human: 你是一名医生，请回答以下问题：{q.strip()}\n\nAssistant: ")
        outputs.append(f"{a.strip()}{tokenizer.eos_token}")
    
    # 批量编码
    model_inputs = tokenizer(inputs, add_special_tokens=False, padding=False, truncation=False)
    labels = tokenizer(outputs, add_special_tokens=False, padding=False, truncation=False)
    
    # 创建结果字典
    full_input_ids = []
    full_attention_masks = []
    full_labels = []
    
    # 拼接输入输出
    for input_ids, label_ids in zip(model_inputs["input_ids"], labels["input_ids"]):
        full_ids = input_ids + label_ids
        attention_mask = [1] * len(full_ids)
        labels = [-100] * len(input_ids) + label_ids
        
        # 截断处理
        if len(full_ids) > MAX_LENGTH:
            full_ids = full_ids[:MAX_LENGTH]
            attention_mask = attention_mask[:MAX_LENGTH]
            labels = labels[:MAX_LENGTH]
        
        full_input_ids.append(full_ids)
        full_attention_masks.append(attention_mask)
        full_labels.append(labels)
    
    # 返回一个字典，每个键是数据集的列名
    return {
        "input_ids": full_input_ids,
        "attention_mask": full_attention_masks,
        "labels": full_labels
    }






In [7]:
tokenized_ds = ds.map(
    medical_process_func,
    batched=True,
    remove_columns=ds["train"].column_names,
    batch_size=100  # 加速处理
)

In [8]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 188490
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7527
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7552
    })
})

# step4:模型加载

In [3]:
model = AutoModelForCausalLM.from_pretrained(
    "D:/study/LLM/unslothDeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit-cMedQA2-Qlora/model/unsloth-DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit",
     device_map="auto",
    use_cache=False  # 梯度检查点需要
)

In [24]:
prompt = "Human: 不是说做b超对宝宝不好吗？那怀孕检查是不？不是说做b超对宝宝不好吗？那怀孕检查是不是越少越好。无麻烦解答，谢谢。\n\nAssistant: "
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=1024,  # 允许回答更长
    temperature=1.0,  # 让回答更加自由
    top_p=0.9,  # 让模型考虑更多可能性
    top_k=50,  # 增加多样性
    repetition_penalty=1.2,  # 防止重复
    no_repeat_ngram_size=3,  # 避免生成相同的三元组短语
    eos_token_id=tokenizer.eos_token_id
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Human: 不是说做b超对宝宝不好吗？那怀孕检查是不？不是说做b超对宝宝不好吗？那怀孕检查是不是越少越好。无麻烦解答，谢谢。

Assistant: 根据你的描述看起来你可能是在关心怀孯时B超的问题了。如果怀孾时间较长的话，一般每两个星期进行一次B超即可，这样可以防止出现发育异常的情况。但也不能盲目多次的去医院，如果发现不正常就及时处理一下


In [7]:
from peft import PeftModel

model_qlora = PeftModel.from_pretrained(model, "../output/medical_lora_adapter")
print("✅ 成功加载 LoRA 适配器！")


✅ 成功加载 LoRA 适配器！


In [25]:
prompt = "Human: 不是说做b超对宝宝不好吗？那怀孕检查是不？不是说做b超对宝宝不好吗？那怀孕检查是不是越少越好。无麻烦解答，谢谢。\n\nAssistant: "
inputs = tokenizer(prompt, return_tensors="pt").to(model_qlora.device)

outputs = model_qlora.generate(
    **inputs,
    max_new_tokens=1024,  # 允许回答更长
    temperature=1.0,  # 让回答更加自由
    top_p=0.9,  # 让模型考虑更多可能性
    top_k=50,  # 增加多样性
    repetition_penalty=1.2,  # 防止重复
    no_repeat_ngram_size=3,  # 避免生成相同的三元组短语
    eos_token_id=tokenizer.eos_token_id
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Human: 不是说做b超对宝宝不好吗？那怀孕检查是不？不是说做b超对宝宝不好吗？那怀孕检查是不是越少越好。无麻烦解答，谢谢。

Assistant: 早期检查有利于监测胎儿发育情况，是为了发现并及时处理那些可影响后续妊娠发展的病情。在3个月以后进行产前诊断也是很重要的，如果孩子有什么遗传性疾病或者器官畸形就可以通过X光等方法提前治疗或防范，对母子都是非常必要和不可缺少的一部分


In [10]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
      (1): LlamaDecoder

In [11]:
for name, param in model.named_parameters():
    print(name, param.shape, param.dtype)

model.embed_tokens.weight torch.Size([128256, 4096]) torch.float16
model.layers.0.self_attn.q_proj.weight torch.Size([8388608, 1]) torch.uint8
model.layers.0.self_attn.k_proj.weight torch.Size([2097152, 1]) torch.uint8
model.layers.0.self_attn.v_proj.weight torch.Size([2097152, 1]) torch.uint8
model.layers.0.self_attn.o_proj.weight torch.Size([8388608, 1]) torch.uint8
model.layers.0.mlp.gate_proj.weight torch.Size([29360128, 1]) torch.uint8
model.layers.0.mlp.up_proj.weight torch.Size([29360128, 1]) torch.uint8
model.layers.0.mlp.down_proj.weight torch.Size([29360128, 1]) torch.uint8
model.layers.0.input_layernorm.weight torch.Size([4096]) torch.float16
model.layers.0.post_attention_layernorm.weight torch.Size([4096]) torch.float16
model.layers.1.self_attn.q_proj.weight torch.Size([8388608, 1]) torch.uint8
model.layers.1.self_attn.k_proj.weight torch.Size([2097152, 1]) torch.uint8
model.layers.1.self_attn.v_proj.weight torch.Size([2097152, 1]) torch.uint8
model.layers.1.self_attn.o_pro

# step5:配置QLoRA

In [12]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],  # 适配Llama架构
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # 打印可训练参数占比

trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695


# step5:配置训练参数

In [13]:
args = TrainingArguments(
    output_dir="../output",
    per_device_train_batch_size=4,          # 根据显存调整
    gradient_accumulation_steps=4,          # 实际batch_size=32
    learning_rate=2e-5,                      # 适合4bit训练
    num_train_epochs=3,
    logging_steps=20,
    evaluation_strategy="no",
    save_strategy="epoch",
    gradient_checkpointing=False,  # 禁用梯度检查点
    fp16=True,                              # 混合精度训练
    optim="paged_adamw_32bit",              # 优化内存使用
    report_to="wandb",
    remove_unused_columns=False   # 保留必要列
)




# step6:创建训练器

In [14]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"].select(range(50000)),
    data_collator=DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True,
        pad_to_multiple_of=8  # 优化显存使用
    ),
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


# step7:开始训练

In [15]:
trainer.train()



Step,Training Loss
20,3.2248
40,3.0194
60,2.8169
80,2.6942
100,2.6659
120,2.6902
140,2.6335
160,2.5876
180,2.5883
200,2.5716




TrainOutput(global_step=9375, training_loss=2.316283463948568, metrics={'train_runtime': 28778.4043, 'train_samples_per_second': 5.212, 'train_steps_per_second': 0.326, 'total_flos': 1.47043326035755e+18, 'train_loss': 2.316283463948568, 'epoch': 3.0})

In [16]:
model.save_pretrained("../output/medical_lora_adapter")

# 推理测试

In [36]:
prompt = "Human: 红楼梦的作者是谁\n\nAssistant: "
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=1024,  # 允许回答更长
    temperature=1.0,  # 让回答更加自由
    top_p=0.9,  # 让模型考虑更多可能性
    top_k=50,  # 增加多样性
    repetition_penalty=1.2,  # 防止重复
    no_repeat_ngram_size=3,  # 避免生成相同的三元组短语
    eos_token_id=tokenizer.eos_token_id
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Human: 红楼梦的作者是谁

Assistant: 鲁迅。鲁迅的小说都是反映中国社会现状，写得很深刻的一些描绘。在我看来，你说的红楼梦是作品中的一个虚构故事吗？鲁迅还有其它小说吗？

这篇文章里面好像很多东西都挺有启发性的，不过我没弄清楚具体情况。

关于“鲁迅”，你知道吗？

根据我的了解，"鲁迅"指的是马自远，他用字号别名鲁迅，即字号鲁文旦。
