In [3]:
import os
import joblib
import requests
from transformers import BlipProcessor, BlipForConditionalGeneration
from datasets import load_dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle

model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

torch.cuda.empty_cache()
torch.manual_seed(42)

class ImageCaptioningDataset(torch.utils.data.Dataset):
    def __init__(self, dataset_path, processor, image_root):
        self.dataset = joblib.load(dataset_path)
        self.processor = processor
        self.image_root = image_root
        self.W, self.H = 512, 512

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset.iloc[idx]
        caption = row['Caption']
        image_path = os.path.join(self.image_root, row['ImagePath'], row['Filename'])
        image = Image.open(image_path).convert("RGB")

        inputs = self.processor(
            images=image,
            text=caption,
            padding="max_length",
            truncation=True,
            max_length=32,
            return_tensors="pt"
        )
        
        labels = inputs.input_ids.clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        
        return {
            "pixel_values": inputs.pixel_values.squeeze(),
            "input_ids": inputs.input_ids.squeeze(),
            "attention_mask": inputs.attention_mask.squeeze(),
            "labels": labels.squeeze()
        }


train_dataset = ImageCaptioningDataset('Train_Data.pkl',processor,'../Data/Training/01.원천데이터')
valid_dataset = ImageCaptioningDataset('Validation_Data.pkl',processor,'../Data/Validation/01.원천데이터/')

batch_size = 4
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9, last_epoch=-1, verbose=False)

num_epochs = 10
patience = 3
min_eval_loss = float("inf")
early_stopping_hook = 0
tracking_information = []
scaler = torch.cuda.amp.GradScaler()

print(next(iter(train_dataloader)))

{'pixel_values': tensor([[[[-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7631, -1.7631],
          [-1.7485, -1.7339, -1.7339,  ..., -1.7923, -1.7923, -1.7923],
          [-1.7193, -1.7047, -1.7047,  ..., -1.6317, -1.6171, -1.6609],
          ...,
          [-1.7777, -1.7777, -1.7631,  ..., -1.7631, -1.7777, -1.7777],
          [-1.7631, -1.7631, -1.7631,  ..., -1.7777, -1.7777, -1.7923],
          [-1.7923, -1.7777, -1.7777,  ..., -1.7777, -1.7777, -1.7485]],

         [[-1.7521, -1.7521, -1.7521,  ..., -1.7371, -1.7371, -1.7071],
          [-1.7071, -1.7071, -1.6921,  ..., -1.7521, -1.7521, -1.7521],
          [-1.6771, -1.6921, -1.6771,  ..., -1.6170, -1.5570, -1.6170],
          ...,
          [-1.7521, -1.7521, -1.7521,  ..., -1.7371, -1.7371, -1.7521],
          [-1.7371, -1.7521, -1.7221,  ..., -1.7521, -1.7221, -1.7371],
          [-1.7521, -1.7521, -1.7371,  ..., -1.7521, -1.7371, -1.7371]],

         [[-1.4802, -1.4802, -1.4802,  ..., -1.4802, -1.4376, -1.4660],
          [-1

  scaler = torch.cuda.amp.GradScaler()


In [4]:
for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()
    for idx, batch in zip(tqdm(range(len(train_dataloader)), desc='Training batch: ...'), train_dataloader):
        input_ids = batch.pop('input_ids').to(device)
        pixel_values = batch.pop('pixel_values').to(device)
        attention_masked = batch.pop('attention_mask').to(device)
        labels = batch.pop('labels').to(device)
        
        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(input_ids=input_ids,
                        pixel_values=pixel_values,
                        # attention_mask=attention_masked,
                        labels=labels)
            
        loss = outputs.loss
        epoch_loss += loss.item()
        # loss.backward()
        # optimizer.step()
        optimizer.zero_grad()
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    
    model.eval()
    eval_loss = 0
    for idx, batch in zip(tqdm(range(len(valid_dataloader)), desc='Validating batch: ...'), valid_dataloader):
        input_ids = batch.pop('input_ids').to(device)
        pixel_values = batch.pop('pixel_values').to(device)
        attention_masked = batch.pop('attention_mask').to(device)
        labels = batch.pop('labels').to(device)

        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(input_ids=input_ids,
                        pixel_values=pixel_values,
                        attention_mask=attention_masked,
                        labels=labels)
        
        loss = outputs.loss
        eval_loss += loss.item()

    tracking_information.append((epoch_loss/len(train_dataloader), eval_loss/len(valid_dataloader), optimizer.param_groups[0]["lr"]))
    print("Epoch: {} - Training loss: {} - Eval Loss: {} - LR: {}".format(epoch+1, epoch_loss/len(train_dataloader), eval_loss/len(valid_dataloader), optimizer.param_groups[0]["lr"]))
    scheduler.step()
    if eval_loss < min_eval_loss:
        model.save_pretrained("blip-saved-model", from_pt=True) 
        print("Saved model to blip-saved-model")
        min_eval_loss = eval_loss
        early_stopping_hook = 0
    else:
        early_stopping_hook += 1
        if early_stopping_hook > patience:
            break
    
# pickle.dump(tracking_information, open("tracking_information.pkl", "wb"))
# print("The finetuning process has done!")

Training batch: ...:   0%|          | 0/2000 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Training batch: ...: 100%|██████████| 2000/2000 [09:45<00:00,  3.41it/s]
Validating batch: ...: 100%|██████████| 250/250 [02:48<00:00,  1.48it/s]


Epoch: 1 - Training loss: 0.353774195343256 - Eval Loss: 2.215343067228794 - LR: 4e-05
Saved model to blip-saved-model


Training batch: ...: 100%|██████████| 2000/2000 [17:14<00:00,  1.93it/s]
Validating batch: ...: 100%|██████████| 250/250 [02:42<00:00,  1.54it/s]


Epoch: 2 - Training loss: 0.2784610586836934 - Eval Loss: 1.6091344608068465 - LR: 3.6e-05
Saved model to blip-saved-model


Training batch: ...: 100%|██████████| 2000/2000 [17:22<00:00,  1.92it/s]
Validating batch: ...: 100%|██████████| 250/250 [02:41<00:00,  1.55it/s]


Epoch: 3 - Training loss: 0.2630824041739106 - Eval Loss: 1.1490215846896172 - LR: 3.24e-05
Saved model to blip-saved-model


Training batch: ...: 100%|██████████| 2000/2000 [17:04<00:00,  1.95it/s]
Validating batch: ...: 100%|██████████| 250/250 [02:40<00:00,  1.56it/s]


Epoch: 4 - Training loss: 0.25818563921749593 - Eval Loss: 1.146082087814808 - LR: 2.9160000000000002e-05
Saved model to blip-saved-model


Training batch: ...: 100%|██████████| 2000/2000 [16:31<00:00,  2.02it/s]
Validating batch: ...: 100%|██████████| 250/250 [02:45<00:00,  1.51it/s]


Epoch: 5 - Training loss: 0.25346651002764703 - Eval Loss: 1.044375267982483 - LR: 2.6244e-05
Saved model to blip-saved-model


Training batch: ...:   2%|▏         | 45/2000 [00:27<19:38,  1.66it/s]


KeyboardInterrupt: 

In [18]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# 디바이스 설정 (CUDA or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 저장한 모델 및 Processor 불러오기
model = BlipForConditionalGeneration.from_pretrained(r"C:\Users\hyssk\MedicalProjects\blip-saved-model").to(device)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# 추론할 이미지 로드
image_path = r"C:\Users\hyssk\MedicalProjects\Data\Validation\01.원천데이터\VS_2.정상\5_1760.png"  # 테스트할 이미지 경로
image = Image.open(image_path).convert("RGB")

# Processor를 사용하여 입력값 생성
inputs = processor(images=image, return_tensors="pt").to(device)

# 모델 추론
model.eval()
with torch.no_grad():
    generated_ids = model.generate(**inputs)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print("생성된 캡션:", caption)


생성된 캡션: this plain abdominal radiograph shows a non - specific bowel gas pattern.
