In [1]:
import torch
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Blip2ForConditionalGeneration,
    Blip2Processor,
)


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

model = Blip2ForConditionalGeneration.from_pretrained(
    "./models/blip2",
    torch_dtype=dtype,
)
processor = Blip2Processor.from_pretrained("./models/blip2_processor")
model = model.to(device)
print("Reloaded model from ./models/blip2")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Blip2ForConditionalGeneration were not initialized from the model checkpoint at ./models/blip2 and are newly initialized: ['language_projection.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Reloaded model from ./models/blip2


In [4]:
image = Image.open("./data/cats.jpeg").convert("RGB")
prompt = "Question: Describe the image. Answer:"

inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, dtype=dtype)
print(inputs)
generated_ids = model.generate(**inputs, max_new_tokens=128)
print(generated_ids)
output = processor.decode(generated_ids[0], skip_special_tokens=True).strip()

print(output)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'input_ids': tensor([[128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256,
         128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256,
         128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256,
         128256, 128256, 128256, 128256, 128256, 128000,  14924,     25,  61885,
            279,   2217,     13,  22559,     25]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[[[ 0.8501,  0.8501,  0.8501,  ..., -0.3032, -0.5366, -0.5806],
          [ 0.8501,  0.8501,  0.8501,  ..., -0.3616, -0.5952, -0.6245],
          [ 0.8501,  0.8501,  0.8501,  ..., -0.4492, -0.6680, -0.6538],
          ...,
          [ 1.3467,  1.3613,  1.3760,  ...,  0.2369,  0.2808,  0.2078],
          [ 1.3906,  1.3760,  1.3760,  ...,  0.1639,  0.1201,  0.0471],
          [ 1.4336,  1.

In [5]:
from datasets import load_dataset, DatasetDict
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
import evaluate
from transformers import Blip2Config, Blip2VisionConfig, Blip2ForConditionalGeneration, Blip2QFormerConfig
from transformers import AutoModel, AutoTokenizer, SwinModel, SwinConfig, AutoModelForCausalLM, AutoTokenizer
from transformers import Blip2Processor, AutoImageProcessor, BlipImageProcessor
from transformers import AddedToken
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
import torch
import torch.nn as nn
from collections import Counter

In [6]:
# Configure HuggingFace cache directories (change the base path if needed)
HF_CACHE_BASE = os.environ.get("HF_CACHE_BASE", r"D:/cache/huggingface")
HF_CACHE_BASE = os.path.abspath(HF_CACHE_BASE)
HF_DATASETS_CACHE = os.path.join(HF_CACHE_BASE, "datasets")
HF_MODELS_CACHE = os.path.join(HF_CACHE_BASE, "models")

for path in (HF_CACHE_BASE, HF_DATASETS_CACHE, HF_MODELS_CACHE):
    os.makedirs(path, exist_ok=True)

os.environ["HF_HOME"] = HF_CACHE_BASE
os.environ["HF_DATASETS_CACHE"] = HF_DATASETS_CACHE
os.environ["TRANSFORMERS_CACHE"] = HF_MODELS_CACHE
os.environ["HF_HUB_CACHE"] = HF_MODELS_CACHE

In [7]:
class VQADataset(torch.utils.data.Dataset):
    """VQA (v2) dataset."""

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def _select_answer(self, answers, fallback=""):
        candidates = []
        if isinstance(answers, dict):
            if isinstance(answers.get("text"), list):
                candidates = answers["text"]
            elif isinstance(answers.get("answer"), list):
                candidates = answers["answer"]
            elif isinstance(answers.get("answers"), list):
                candidates = answers["answers"]
        elif isinstance(answers, list):
            if answers and isinstance(answers[0], dict):
                candidates = [a.get("text") or a.get("answer") for a in answers if a.get("text") or a.get("answer")]
            else:
                candidates = answers
        elif isinstance(answers, str):
            candidates = [answers]

        candidates = [c for c in candidates if isinstance(c, str) and c]
        if candidates:
            # make sure the answer contains at most one string
            return Counter(candidates).most_common(1)[0][0]
        return fallback

    def __getitem__(self, idx):
        item = self.dataset[idx]
        question = item["question"]
        answer = self._select_answer(item.get("answers"), item.get("multiple_choice_answer", ""))
        image = item["image"]

        #encoding和decoding的max_length必须一样,这里都是64.因为language_model的logits的seq_len就是max_length,同时计算loss会logits = logits[:, -labels.size(1) :, :]，这里max_length不一样截取后loss计算就会错
        encoding = self.processor(
            images=image,
            text=question,
            padding="max_length",
            max_length=64,
            truncation=True,
            return_tensors="pt",
        )
        labels = self.processor.tokenizer(
            answer,
            max_length=64,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        #only keep the first eos in labels, other eos would be replaced by ignored_index of loss function, so that the loss won't count them
        label_ids = labels["input_ids"].squeeze(0)
        pad_token_id = self.processor.tokenizer.pad_token_id
        eos_token_id = self.processor.tokenizer.eos_token_id
        if eos_token_id is None:
            eos_token_id = pad_token_id

        if pad_token_id is not None and pad_token_id != eos_token_id:
            label_ids[label_ids == pad_token_id] = -100

        if eos_token_id is not None:
            eos_positions = (label_ids == eos_token_id).nonzero(as_tuple=False)
            if eos_positions.numel() > 0:
                label_ids[eos_positions.flatten()] = -100
                first_eos_idx = eos_positions[0].item()
                label_ids[first_eos_idx] = eos_token_id

        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = label_ids
        return encoding

In [8]:
from datasets.download.download_config import DownloadConfig

##################################      Creating Dataset and Dataloader    ##################################
raw_dataset = load_dataset(
    "HuggingFaceM4/VQAv2",
    cache_dir=HF_DATASETS_CACHE,
    trust_remote_code=True,
     download_config= DownloadConfig(storage_options={"timeout": 3600})
)

dataset = DatasetDict({
    "train": raw_dataset["train"],
    "test": raw_dataset["test"]
})


Repo card metadata block was not found. Setting CardData to empty.


In [9]:
print(dataset["train"][0])

{'question_type': 'what is this', 'multiple_choice_answer': 'net', 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 458752, 'answer_type': 'other', 'question_id': 458752000, 'question': 'What is this photo taken looking through?', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x10F05622980>}


In [10]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
     # 预测：转到 CPU
    logits = logits.detach().cpu().numpy()

    # 标签：把 -100 换回 eos_token_id 才能 decode
    labels = labels.clone()
    labels[labels == -100] = processor.tokenizer.eos_token_id
    labels = labels.detach().cpu().numpy()

    decoded_preds = processor.tokenizer.batch_decode(logits, skip_special_tokens=True)
    decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Compute ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # We can return ROUGE-1, ROUGE-2, and ROUGE-L as needed
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"],
    }



train_dataset = VQADataset(dataset=dataset["train"],
                          processor=processor)
valid_dataset = VQADataset(dataset=dataset["test"],
                          processor=processor)
batch_size = 10
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=False, pin_memory=True)
loss_fct = nn.CrossEntropyLoss(ignore_index=-100, reduction="mean")

In [11]:
model.eval()
train_acc = 0
train_tqdm = tqdm(range(len(train_dataloader)), desc=f'Epoch {1} - Training loss: 0.000 - Train Acc: 0.000', position=0)
epoch_loss = 0
for idx, batch in zip(train_tqdm, train_dataloader):
    input_ids = batch.pop('input_ids').to(device)
    pixel_values = batch.pop('pixel_values').to(device)
    attention_masked = batch.pop('attention_mask').to(device)
    labels = batch.pop('labels').to(device)
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_masked,
            labels=labels,
        )
    logits = outputs.logits        # [B, T_total, V]

    # 完全照源码对齐：只取与 labels 等长的尾部，做自回归 shift
    #logits = logits[:, -labels.size(1):, :]              # [B, T, V]
    shift_logits = logits[..., :-1, :].contiguous()      # [B, T-1, V]
    shift_labels = labels[..., 1:].contiguous()          # [B, T-1]

    loss = loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
    )

    epoch_loss += loss.item()
    pred = outputs.logits.argmax(dim=-1)
    train_acc += compute_metrics((pred, labels))["rougeL"]  # You can use rouge1,
    # Clear cache to avoid OOM
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

Epoch 1 - Training loss: 0.000 - Train Acc: 0.000:   0%|          | 0/44376 [01:19<?, ?it/s]


KeyboardInterrupt: 