In [5]:
#设备加载 ，如果有安装cuda，则使用cuda，否则使用cpu，运行后会显示使用了什么设备
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
print(torch.__version__)

cuda
2.5.1+cu124


In [6]:
#使用相对路径
import os
BASEDIR = "./"

In [7]:
#模型路径
model_dir = os.path.join(BASEDIR,'../model/Qwen2-VL-7B-Instruct')
print(model_dir)

./../model/Qwen2-VL-7B-Instruct


In [8]:
#分词器加载
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    use_fast=False, 
    trust_remote_code=True
)

In [9]:
from transformers import Qwen2VLForConditionalGeneration
#模型加载
#模型路径

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_dir,

    device_map =device,#使用cuda设备
    
    torch_dtype=torch.bfloat16#使用半精度浮点数
)




`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 5/5 [00:06<00:00,  1.25s/it]


In [None]:
#使用模型
from transformers import generation

input_text = input("请输入文字：")
input_text = tokenizer.encode(input_text, return_tensors="pt").to(device)

output = model.generate(input_text, max_length=50, num_beams=5, early_stopping=True)
print(tokenizer.decode(output[0], skip_special_tokens=True))


In [10]:
model.train()

Qwen2VLForConditionalGeneration(
  (visual): Qwen2VisionTransformerPretrainedModel(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2VLVisionBlock(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): VisionSdpaAttention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): VisionMlp(
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): QuickGELUActivation()
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        )
      )
    )
    (merger): PatchMerger(
      (ln_q): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (mlp): Seq

In [None]:
#查看模型有哪些部分
for name, module in model.named_modules():
    print(name)
for name, param in model.named_parameters():
    print(name)

In [11]:
#加载lora配置
from peft import LoraConfig ,get_peft_model

config = LoraConfig(
    r=8,                     # 增大秩
    lora_alpha=16,          # 增大 alpha
    lora_dropout=0.2,       # 增大 dropout 率以防止过拟合
    target_modules=["q_proj", "k_proj", "v_proj", "fc1", "fc2"]  # 扩展到 MLP 层
)

#将模型和lora配置结合
model = get_peft_model(model, config)

In [None]:
#检查分词器
sents = [
    '你站在桥上看风景',
    '看风景的人在楼上看你',
    '明月装饰了你的窗子',
    '你装饰了别人的梦',
]
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs = [(sents[0],sents[1]),(sents[2],sents[3])],
    text_pair = sents[1],
    truncation = True,
    padding = 'max_length',
    add_special_tokens =True,
    max_length = 25,
    return_tensors = None,
    return_token_type_ids = True,
    return_attention_mask = True,
    return_special_tokens_mask = True,
    # return_offsets_mapping = True,
    return_length = True,
    
)
for k , v in out.items():
    print(k,':',v)
print(tokenizer.decode(out['input_ids'][0]))

In [23]:
#数据集的处理，将jsonl文件转化为huggingface使用的dataset文件
#文件路径
import pandas as pd
from datasets import Dataset
dataset_dir = os.path.join(BASEDIR,"output.jsonl")
print(dataset_dir)


./output.jsonl


In [24]:
def process(data):
    '''
    将传入数据编码，
    '''
    MAX_LENGTH = 384
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(
        f"<| im_start |>system\n{data['instruction']}<|im_end|>\n <|im_start|>user\n {data['input']} <|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
        padding=True,
        truncation=True
        )
    response = tokenizer(f"{data['output']} <|im_end|>", add_special_tokens=False)
    input_ids = instruction['input_ids']+response['input_ids']+[tokenizer.pad_token_id]
    attention_mask = instruction['attention_mask']+response['attention_mask']+[1]
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    
    return {
        "input_ids":input_ids,
        "attention_mask":attention_mask,
        "labels":labels
    }

datadf = pd.read_json(dataset_dir,lines=True)
datads = Dataset.from_pandas(datadf)
train_data = datads.map(
    process,
    remove_columns=datads.column_names
    )



Map: 100%|██████████| 1059/1059 [00:00<00:00, 1397.08 examples/s]


In [25]:
#查看数据集
print(train_data['input_ids'])
decode_data = tokenizer.batch_decode(train_data['input_ids'][:10], skip_special_tokens=True)
print(decode_data)

[[27, 91, 732, 4906, 58721, 8948, 198, 101254, 151645, 198, 220, 151644, 872, 198, 220, 108386, 11, 14880, 109432, 107828, 220, 151645, 198, 151644, 77091, 198, 108386, 11, 97611, 101419, 99882, 103303, 100167, 11, 100141, 100659, 111285, 100167, 100167, 11, 20412, 91680, 15469, 107938, 11, 99729, 105292, 11, 101170, 99242, 220, 151645, 151643], [27, 91, 732, 4906, 58721, 8948, 198, 101254, 151645, 198, 220, 151644, 872, 198, 220, 109194, 26232, 109432, 107828, 81596, 11319, 220, 151645, 198, 151644, 77091, 198, 104198, 103303, 100167, 11, 99639, 91680, 15469, 107938, 11, 67071, 103303, 103303, 100013, 11, 99729, 105292, 220, 151645, 151643], [27, 91, 732, 4906, 58721, 8948, 198, 101254, 151645, 198, 220, 151644, 872, 198, 220, 56568, 102021, 11319, 220, 151645, 198, 151644, 77091, 198, 35946, 99639, 91680, 67071, 103303, 103303, 100013, 9370, 15469, 107938, 11, 99729, 105292, 11, 101170, 99242, 57218, 104939, 220, 151645, 151643], [27, 91, 732, 4906, 58721, 8948, 198, 101254, 151645, 

In [19]:
#加载训练器
from transformers import TrainingArguments
from transformers import Trainer
from transformers.data.data_collator import DataCollatorForSeq2Seq
#训练器超参数
args = TrainingArguments(
    output_dir='./output_dir',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=2,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
    remove_unused_columns=False,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
)




In [21]:
for param in model.parameters():
      param.requires_grad = True
      print(param.requires_grad)
    # param.requires_grad = True

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [22]:
trainer.train()

  0%|          | 0/132 [00:10<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 130.00 MiB. GPU 0 has a total capacity of 24.00 GiB of which 0 bytes is free. Of the allocated memory 22.87 GiB is allocated by PyTorch, and 296.61 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
#模型使用，前面已经导入模型的情况下
#模型加载
#模型路径
from transformers import Qwen2VLForConditionalGeneration
model_dir_trained = os.path.join(BASEDIR,"output_dir/checkpoint-4")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_dir_trained,

    device_map =device,#使用cuda设备
    
    torch_dtype=torch.bfloat16#使用半精度浮点数
)


In [None]:
from transformers import generation

input_text = input("请输入文字：")
input_text = tokenizer.encode(input_text, return_tensors="pt").to(device)

output = model.generate(input_text, max_length=50, num_beams=5, early_stopping=True)
print(tokenizer.decode(output[0], skip_special_tokens=True))