In [1]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import torch


import os

# Directory holding the Lingshu-7B checkpoint. The LINGSHU_MODEL_PATH
# environment variable overrides the default so the notebook can run on a
# machine where the weights live somewhere else.
model_path = os.environ.get("LINGSHU_MODEL_PATH", "/mnt/d/skinalor/model/Lingshu-7B")

# Load Lingshu-7B through the Qwen2.5-VL model class with bfloat16 weights;
# device_map="auto" leaves weight placement to the loader.
# NOTE(review): the recorded output of this cell says some parameters were
# offloaded to the CPU / meta device — confirm the model fits on the
# accelerator before training.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Tokenizer and processor come from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)

# Original note: "allow gradient updates". Presumably needed because gradient
# checkpointing is switched on in the TrainingArguments further down — TODO confirm.
model.enable_input_require_grads()

2025-10-29 13:29:35.942454: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-29 13:29:36.163011: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761744576.249360   33529 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761744576.274393   33529 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1761744576.476651   33529 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [None]:
from datasets import Dataset
import json


# Number of trailing records held out as the test split.
HOLDOUT_SIZE = 4

data_path = "path/to/your/dataset.json"

# Read with an explicit encoding so the result does not depend on the
# platform's locale default (the prompts used in this notebook are non-ASCII).
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Slicing below would silently produce an empty training split otherwise.
if len(data) <= HOLDOUT_SIZE:
    raise ValueError(
        f"Dataset at {data_path!r} has {len(data)} records; "
        f"need more than {HOLDOUT_SIZE} to leave a non-empty training split."
    )

# Everything except the last HOLDOUT_SIZE records is used for training.
train_data = data[:-HOLDOUT_SIZE]
test_data = data[-HOLDOUT_SIZE:]

# Persist both splits to the working directory.
with open("train_data.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f)
with open("test_data.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f)

# Build the training Dataset from the file that was just written.
train_ds = Dataset.from_json("train_data.json")

In [None]:
from qwen_vl_utils import process_vision_info
import torch

def process_func(example, max_length=8192, image_size=256, prompt="请描述这张图片的内容。"):
    """
    Turn one raw conversation record into a supervised training sample.

    The image path is read from between the ``<|vision_start|>`` and
    ``<|vision_end|>`` markers of the first turn; the second turn is the
    target text. Only the image path is taken from the first turn — the
    instruction shown to the model is always ``prompt``.

    Args:
        example: Record with a "conversations" list whose first two items each
            carry a "value" entry (input turn, then target turn).
        max_length: Maximum number of tokens kept; longer samples are cut off
            at the end.
        image_size: Height and width requested for the image.
        prompt: Instruction text paired with the image.

    Returns:
        dict of tensors: "input_ids", "attention_mask", "labels",
        "pixel_values" and "image_grid_thw".

    Raises:
        ValueError: If the first turn contains no ``<|vision_start|>`` marker.
    """
    conversation = example["conversations"]
    input_content = conversation[0]["value"]
    output_content = conversation[1]["value"]

    # Fail with a readable message instead of a bare IndexError when the
    # record carries no image reference.
    segments = input_content.split("<|vision_start|>")
    if len(segments) < 2:
        raise ValueError(
            "Sample has no '<|vision_start|>' marker in its first turn: "
            f"{input_content!r}"
        )
    file_path = segments[1].split("<|vision_end|>")[0]

    # Build the multimodal chat: one user turn with the image and the prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": file_path,
                    "resized_height": image_size,
                    "resized_width": image_size,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Convert tensors to plain lists so prompt and response can be concatenated.
    inputs = {key: value.tolist() for key, value in inputs.items()}
    prompt_ids = inputs["input_ids"][0]
    prompt_mask = inputs["attention_mask"][0]

    response = tokenizer(f"{output_content}", add_special_tokens=False)

    # The pad token is appended after the response as its terminator.
    input_ids = prompt_ids + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = prompt_mask + response["attention_mask"] + [1]
    # Prompt positions get -100 (the label value conventionally ignored by the
    # loss), so only the response and its terminator are supervised.
    labels = [-100] * len(prompt_ids) + response["input_ids"] + [tokenizer.pad_token_id]

    # Truncate from the right when the sample is too long.
    if len(input_ids) > max_length:
        input_ids = input_ids[:max_length]
        attention_mask = attention_mask[:max_length]
        labels = labels[:max_length]

    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "labels": torch.tensor(labels),
        "pixel_values": torch.tensor(inputs["pixel_values"]),
        # One image per sample, so the leading axis is dropped —
        # assumes shape (1, 3); TODO confirm.
        "image_grid_thw": torch.tensor(inputs["image_grid_thw"]).squeeze(0),
    }

In [None]:

# Apply the preprocessing function to every record of the training split.
train_dataset = train_ds.map(process_func)

print(f"Train dataset size: {len(train_dataset)}")

# Show a compact summary of the first sample (length of each list-valued
# field) rather than printing it whole: the raw sample includes the full
# nested pixel_values list, which floods the notebook output.
first_sample = train_dataset[0]
print({
    name: (len(value) if isinstance(value, list) else value)
    for name, value in first_sample.items()
})

In [None]:
from peft import LoraConfig, get_peft_model
 
# LoRA hyper-parameters for training (inference_mode=False): rank 64, alpha 16,
# dropout 0.05, no bias terms. Adapters are attached to every module whose name
# matches one of the listed projection names.
config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    inference_mode=False,
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the base model with the adapters described by `config`.
# NOTE(review): get_peft_model presumably modifies `model` in place — confirm
# before reusing `model` elsewhere.
peft_model = get_peft_model(model=model, peft_config=config)

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
import os
 
# Training schedule: 5 epochs at lr 1e-4, 4 samples per device with 4
# accumulation steps, gradient checkpointing on, a log line every 10 steps and
# a checkpoint every 74 steps under output/Qwen2.5-VL-LoRA.
args = TrainingArguments(
    output_dir="output/Qwen2.5-VL-LoRA",
    num_train_epochs=5,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    logging_steps=10,
    save_steps=74,
)

# The collator pads each batch using the tokenizer.
trainer = Trainer(
    args=args,
    model=peft_model,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    train_dataset=train_dataset,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
from peft import PeftModel
 
# Checkpoint directory written by the training run; replace XXX with a real step.
peft_model_path = "output/Qwen2.5-VL-LoRA/checkpoint-XXX"

# Let PEFT read the adapter configuration stored with the checkpoint instead
# of passing the training-time `config` object: from_pretrained overwrites
# `inference_mode` on a config it is handed, which would silently change the
# shared training config, and the saved configuration is the one that matches
# the saved weights.
# NOTE(review): `model` was already passed to get_peft_model in the training
# cell; for a clean evaluation consider reloading the base model first.
val_peft_model = PeftModel.from_pretrained(model, peft_model_path)

# Single-turn request: one image plus the instruction text.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "path/to/image.jpg"},
            {"type": "text", "text": "请描述这张图片的内容。"},
        ],
    }
]
 
def predict(messages, model):
    """Generate a reply for a multimodal chat and return it as text.

    Args:
        messages: Chat messages in the format accepted by the processor's
            chat template and by ``process_vision_info``.
        model: Model used for generation; inputs are moved to its device.

    Returns:
        The decoded reply for the first (only) sequence, with the prompt
        tokens and special tokens removed. At most 128 new tokens are
        generated.
    """
    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    images, videos = process_vision_info(messages)

    batch = processor(
        text=[prompt_text],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    )
    batch = batch.to(model.device)

    sequences = model.generate(**batch, max_new_tokens=128)

    # generate() returns prompt + continuation; keep only the continuation.
    continuations = []
    for prompt_ids, sequence in zip(batch.input_ids, sequences):
        continuations.append(sequence[len(prompt_ids):])

    decoded = processor.batch_decode(
        continuations,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]
 
# Run the fine-tuned model on the sample request and show its reply.
response = predict(messages=messages, model=val_peft_model)
print(response)

In [5]:
# Run an external shell script through IPython's `!` shell escape. The script
# itself is not part of this notebook; judging by its file name it presumably
# launches the Lingshu-7B SFT job — verify against the script.
!bash ../scripts/sft_lingshu_7b_1.sh

12874.24s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


[INFO] nproc_per_node=1 master=127.0.0.1:22162
2025-10-29 17:04:07.305038: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-29 17:04:07.312711: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761757447.321157  134247 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761757447.323822  134247 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1761757447.330994  134247 computation_placer.cc:177] computation placer already