In [None]:
# Imports and device setup: use CUDA when it is available, otherwise fall back to the CPU; the selected device is printed below.
import torch
import os
from peft import LoraConfig ,get_peft_model
from transformers import Qwen2VLForConditionalGeneration,generation
import pandas as pd
from datasets import Dataset
from transformers import TrainingArguments
from transformers import Trainer
from transformers.data.data_collator import DataCollatorForSeq2Seq
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Root directory that the model and dataset paths are resolved against.
BASEDIR = "./"

In [None]:
# Load the tokenizer.
from transformers import AutoTokenizer

# This cell runs before the model-loading cell, so the checkpoint path must be
# defined here; otherwise a fresh "Restart & Run All" fails with a NameError
# on `model_dir`.
model_dir = os.path.join(BASEDIR, 'model/Qwen2-VL-2B-Instruct')

tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    use_fast=False,
    trust_remote_code=True
)

In [None]:

# Load the base model.
# Location of the pretrained checkpoint, resolved against BASEDIR.
model_dir = os.path.join(BASEDIR, 'model/Qwen2-VL-2B-Instruct')
print(model_dir)

# Weights are loaded in bfloat16 and placed on the device selected earlier
# (CUDA when available, otherwise CPU).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_dir,
    torch_dtype=torch.bfloat16,
    device_map=device,
)




In [None]:
# Try the model on a prompt typed by the user.
input_text = input("请输入文字：")

# Calling the tokenizer (instead of `encode`) also returns the attention mask,
# which `generate` uses to tell real tokens from padding.
model_inputs = tokenizer(input_text, return_tensors="pt").to(device)

# `max_new_tokens` bounds only the generated continuation. The previous
# `max_length=50` also counted the prompt tokens, so a prompt of 50 or more
# tokens left no room for any output.
output = model.generate(
    **model_inputs,
    max_new_tokens=50,
    num_beams=5,
    early_stopping=True,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))


In [None]:
# model.train()

In [None]:
# Print the qualified name of every submodule contained in the model.
for module_name, _submodule in model.named_modules():
    print(module_name)

In [None]:
# LoRA adapter configuration.
config = LoraConfig(
    # Modules that receive adapters: the q/k/v projections plus the fc1/fc2
    # MLP layers.
    target_modules=["q_proj", "k_proj", "v_proj", "fc1", "fc2"],
    r=8,               # rank of the low-rank update
    lora_alpha=16,     # LoRA alpha (scaling) value
    lora_dropout=0.2,  # dropout rate, raised to limit overfitting
)

# Combine the base model with the LoRA configuration.
model = get_peft_model(model, config)

In [None]:
# Sanity-check the tokenizer on two sentence pairs.
sents = [
    '你站在桥上看风景',
    '看风景的人在楼上看你',
    '明月装饰了你的窗子',
    '你装饰了别人的梦',
]

# Each tuple is encoded as one (first sentence, second sentence) pair.
sentence_pairs = [(sents[0], sents[1]), (sents[2], sents[3])]

# The former `text_pair=sents[1]` argument was dropped: `batch_encode_plus`
# has no such parameter (the pairs are already supplied as tuples), so it was
# only forwarded as an unrecognized keyword argument.
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=sentence_pairs,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    max_length=25,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    # return_offsets_mapping=True,  # left disabled, as in the original cell
    return_length=True,
)

# Show every returned field, then decode the first encoded pair back to text.
for k, v in out.items():
    print(k, ':', v)
print(tokenizer.decode(out['input_ids'][0]))

In [None]:
# Path of the JSON Lines training file, resolved against BASEDIR.
dataset_dir = os.path.join(BASEDIR, "train_TCG/train.jsonl")
print(dataset_dir)

def process(data, max_length=384):
    '''
    Tokenize one training record into supervised fine-tuning features.

    The system and user turns form the prompt; the assistant turn is the
    training target. Uses the module-level `tokenizer`.

    Args:
        data: mapping with the fields 'instruction', 'input' and 'output'.
        max_length: maximum number of tokens kept per example. Longer
            sequences are cut from the right, so an over-long example loses
            the tail of its response.

    Returns:
        dict with three equally long lists: 'input_ids', 'attention_mask'
        and 'labels'. Prompt positions in 'labels' are set to -100 so that
        only the response (and the closing pad token) contributes to the loss.
    '''
    # The chat markers must be spelled exactly "<|im_start|>" / "<|im_end|>".
    # The earlier "<| im_start |>" (with spaces) is not the special token and
    # was tokenized as ordinary text; stray spaces around the other markers
    # were removed for the same reason.
    prompt = (
        f"<|im_start|>system\n{data['instruction']}<|im_end|>\n"
        f"<|im_start|>user\n{data['input']}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    # No padding here: examples are padded per batch by the data collator.
    instruction = tokenizer(prompt, add_special_tokens=False)
    response = tokenizer(f"{data['output']}<|im_end|>", add_special_tokens=False)

    # A pad token is appended as the final target and is attended to (mask 1).
    input_ids = instruction['input_ids'] + response['input_ids'] + [tokenizer.pad_token_id]
    attention_mask = instruction['attention_mask'] + response['attention_mask'] + [1]
    labels = [-100] * len(instruction['input_ids']) + response['input_ids'] + [tokenizer.pad_token_id]

    # Enforce the length budget (previously declared as MAX_LENGTH but never applied).
    if len(input_ids) > max_length:
        input_ids = input_ids[:max_length]
        attention_mask = attention_mask[:max_length]
        labels = labels[:max_length]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Read the JSON Lines file into a DataFrame, wrap it as a Dataset, and run
# `process` over every record. The original columns are removed so that only
# the features returned by `process` remain.
datadf = pd.read_json(dataset_dir, lines=True)
datads = Dataset.from_pandas(datadf)
source_columns = datads.column_names
train_data = datads.map(process, remove_columns=source_columns)

# with open(data_path,'r',encoding='utf-8') as f:
#     for line in f:
#         instruct = f["instruction"]
#         input = f['input']
#         output = f['output']

# The code above handles the JSONL dataset file.

In [None]:
# Inspect the tokenized dataset.
# Only a small preview is shown: printing the whole `input_ids` column floods
# the notebook output and bloats the saved file.
PREVIEW_ROWS = 10
preview_ids = train_data[:PREVIEW_ROWS]['input_ids']

print(len(train_data), 'examples')
print(preview_ids[:1])  # token ids of the first example

# Decode the previewed examples back to text to check the prompt template.
decode_data = tokenizer.batch_decode(preview_ids, skip_special_tokens=True)
print(decode_data)

In [None]:
# Build the trainer.

# Training hyperparameters.
args = TrainingArguments(
    output_dir='./output_dir',
    num_train_epochs=2,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    logging_steps=10,
    save_steps=100,
    save_on_each_node=True,
    report_to="none",
    remove_unused_columns=False,
)

# Collator that pads the examples of each batch to a common length.
batch_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    data_collator=batch_collator,
)

In [None]:
# Gradient checkpointing is enabled in the training arguments. With a frozen
# base model, the checkpointed segments need an input that requires grad,
# otherwise the backward pass has nothing to differentiate through. This hook
# makes the embedding output require grad.
model.enable_input_require_grads()

# Do NOT set `requires_grad = True` on every parameter: that unfreezes the
# whole base model and turns LoRA training into full fine-tuning, while the
# adapter-only checkpoint would not contain those base-weight updates.
# Report what is actually trainable instead of printing one line per parameter.
if hasattr(model, "print_trainable_parameters"):
    model.print_trainable_parameters()

In [None]:
trainer.train()

In [None]:
# Use the fine-tuned model; assumes the earlier cells have already been run.
# NOTE(review): "checkpoint-4" is hard-coded - confirm it matches a checkpoint
# directory that the trainer actually wrote.
model_dir_trained = os.path.join(BASEDIR, "output_dir/checkpoint-4")

# Same loading options as for the base model: bfloat16 weights on the device
# selected earlier.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_dir_trained,
    torch_dtype=torch.bfloat16,
    device_map=device,
)


In [None]:


# Try the reloaded model on a prompt typed by the user.
input_text = input("请输入文字：")

# Calling the tokenizer (instead of `encode`) also returns the attention mask,
# which `generate` uses to tell real tokens from padding.
model_inputs = tokenizer(input_text, return_tensors="pt").to(device)

# `max_new_tokens` bounds only the generated continuation. The previous
# `max_length=50` also counted the prompt tokens, so a prompt of 50 or more
# tokens left no room for any output.
output = model.generate(
    **model_inputs,
    max_new_tokens=50,
    num_beams=5,
    early_stopping=True,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))