In [None]:
!pip install -U git+https://github.com/huggingface/transformers.git datasets trl peft accelerate bitsandbytes pillow

In [1]:
import os
from datasets import load_dataset, load_from_disk
from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer
import torch
from peft import get_peft_model, LoraConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Load Dataset
# Take the first 10% of the VQAv2 training split and keep only the columns
# that are still present after the removal below.
ds = load_dataset('HuggingFaceM4/VQAv2', split="train[:10%]")
cols_remove = ["question_type", "answers", "answer_type", "image_id", "question_id"]
ds = ds.remove_columns(cols_remove)

# Fixed seed so the same records are selected on every Restart & Run All;
# without it the split is re-shuffled randomly on each run.
split_ds = ds.train_test_split(test_size=0.05, seed=42)

# NOTE(review): the training set is taken from the 5% "test" portion of the
# split. This looks like a deliberate way to train on a small subset, but
# confirm that split_ds["train"] was not intended.
train_ds = split_ds["test"]
print(train_ds[0])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


{'multiple_choice_answer': 'yes', 'question': 'Is the picture black and white?', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=L size=516x640 at 0x16DA5D180>}


In [3]:
# 2. Load Model
# Checkpoint id and target device used by the rest of the notebook.
model_id = "google/paligemma-3b-pt-224"
device = "cpu"

# Processor (tokenizer + image preprocessing) and the id of the "<image>" token.
processor = PaliGemmaProcessor.from_pretrained(model_id)
image_token = processor.tokenizer.convert_tokens_to_ids("<image>")

# Load the weights in bfloat16 and place the model on `device`.
model = PaliGemmaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
).to(device)

# Disable gradients for every parameter of the vision tower and of the
# multi-modal projector.
for frozen_module in (model.vision_tower, model.multi_modal_projector):
    for param in frozen_module.parameters():
        param.requires_grad = False

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 3/3 [00:29<00:00,  9.75s/it]


In [None]:
# Loading Quantised Model
# BitsAndBytesConfig holds the quantization settings for the model.
# Quantization reduces model size and compute requirements. Here the model is
# configured for 4-bit quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # quantize the weights to 4 bits when loading
    bnb_4bit_quant_type="nf4",  # use the NF4 (normalized float 4) quantization type
    # The option is spelled `bnb_4bit_compute_dtype`. The previous spelling
    # (`bnb_4bit_compute_type`) is not a recognised option, so the bfloat16
    # compute dtype was never actually applied.
    bnb_4bit_compute_dtype=torch.bfloat16,  # run computations in bfloat16
)

# LoRA is a low-rank adaptation technique that reduces the number of
# parameters updated in a large model. LoraConfig sets the LoRA parameters:
# the rank and the modules that receive adapters.
lora_config = LoraConfig(
    r=8,  # rank of the adapter matrices
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj",
                    "gate_proj", "up_proj", "down_proj"],  # modules to adapt
    task_type="CAUSAL_LM",  # task type
)

In [None]:
# Use this variant when a GPU is available (loads the model with bnb_config on device 0)
#model = PaliGemmaForConditionalGeneration.from_pretrained(model_id,
#                                                          quantization_config=bnb_config,
 #                                                         device_map={"": 0})

In [None]:
# CPU variant: load the checkpoint without a device_map.
# No quantization_config is passed here, so this is NOT a quantized load.
# Load in bfloat16 so this model agrees with the other from_pretrained calls
# in this notebook and with the bfloat16 batches produced by collate_fn;
# the default dtype would roughly double the memory needed on the CPU.
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# Keep the model on the CPU. The result is assigned so the (very long) module
# repr is not echoed as the cell output.
model = model.to('cpu')


In [None]:
# Wrap the model with PEFT using the LoRA configuration (lora_config)
model = get_peft_model(model, lora_config)
# Print the trainable-parameter summary
model.print_trainable_parameters()

In [None]:
# 3. Fine-tuning the model
def collate_fn(examples):
    """Build one model-ready batch from a list of dataset records.

    Each record is read through its ``question``, ``multiple_choice_answer``
    and ``image`` entries. Questions are prefixed with ``"answer "``, the
    answers are handed to the processor as ``suffix``, and every image is
    converted to RGB first.

    Uses the module-level ``processor`` and ``device``. The processor output
    is cast with ``.to(torch.bfloat16)`` and then moved to ``device`` before
    being returned.
    """
    prompts = ["answer " + record["question"] for record in examples]
    answers = [record['multiple_choice_answer'] for record in examples]
    rgb_images = [record["image"].convert("RGB") for record in examples]

    batch = processor(
        text=prompts,
        images=rgb_images,
        suffix=answers,
        return_tensors="pt",
        padding="longest",
        tokenize_newline_separately=False,
    )
    return batch.to(torch.bfloat16).to(device)

In [None]:


# Training configuration for the Trainer.
# Effective batch per optimizer step on one device: 2 samples x 8 accumulation steps = 16.
args = TrainingArguments(
    num_train_epochs=2,
    remove_unused_columns=False,  # collate_fn reads the raw question/answer/image columns
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=2,
    learning_rate=2e-5,
    weight_decay=1e-6,
    adam_beta2=0.999,
    logging_steps=100,
    # "adamw_hf" selects the deprecated AdamW implementation bundled with
    # transformers; "adamw_torch" uses torch.optim.AdamW, which is the
    # supported replacement (this notebook installs transformers from main).
    optim="adamw_torch",
    save_strategy="steps",
    save_steps=1000,
    push_to_hub=True,  # requires being authenticated with the Hugging Face Hub
    save_total_limit=1,
    output_dir="paligemma_vqav2",
    bf16=True,
    dataloader_pin_memory=False
)



In [None]:
# Assemble the Trainer from the PEFT-wrapped model, the training arguments,
# the training subset and the custom batch collator.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    data_collator=collate_fn,
)



In [None]:
# Start training with the Trainer constructed from `args`.
trainer.train()



In [None]:
# 4. Save the model in HuggingFace
# NOTE(review): the first positional parameter of Trainer.push_to_hub appears
# to be `commit_message`, so this string is probably used as the commit
# message rather than as the target repository. The destination repo is
# normally derived from TrainingArguments (hub_model_id / output_dir) —
# confirm against the transformers documentation.
trainer.push_to_hub('mervinpraison/paligemma_VQAv2')

In [None]:
import getpass
import os

from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration
import torch
from huggingface_hub import login, hf_hub_download, HfApi, HfFolder

# Log in to Hugging Face.
# SECURITY: never hardcode an access token in the notebook. The token is read
# from the HF_TOKEN environment variable, falling back to an interactive
# prompt that does not echo the input. Any token that was ever committed in
# this notebook must be treated as leaked and revoked in the Hub settings.
api = HfApi()
token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: ")
login(token=token, add_to_git_credential=True)

# Check access to the model repository
model_id = "google/paligemma-3b-pt-224"
username = api.whoami(token)["name"]
permissions = api.model_info(repo_id=model_id, token=token).card_data

# NOTE(review): this treats empty card metadata as "no access"; confirm that
# card_data is a reliable signal for gated-repo permission.
if not permissions:
    raise Exception(f"您没有访问 {model_id} 的权限，请访问 {model_id} 页面并请求访问权限。")

# Load the processor and the model
try:
    processor = PaliGemmaProcessor.from_pretrained(model_id)
    device = "cpu"
    image_token = processor.tokenizer.convert_tokens_to_ids("<image>")
    model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)

    # Freeze part of the model: vision tower and multi-modal projector
    for param in model.vision_tower.parameters():
        param.requires_grad = False 
    for param in model.multi_modal_projector.parameters():
        param.requires_grad = False
except OSError as e:
    # Loading failed (OSError): report it and point at the access request.
    print(f"无法访问模型存储库：{e}")
    print(f"请确保您已经请求并获得访问权限：{model_id}")
