In [1]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_image_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)
    return embedding.cpu().numpy()

def get_text_embedding(text):
    inputs = processor(text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        embedding = model.get_text_features(**inputs)
    return embedding.cpu().numpy()


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, ViTModel

class VisionLanguageModel(nn.Module):
    def __init__(self):
        super(VisionLanguageModel, self).__init__()
        # Use CLIP model instead of separate ViT and BERT
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.decoder = nn.Linear(512, 512)  # CLIP uses 512 dimensions

    def forward(self, image_features, text):
        # Assuming image_features is already processed through CLIP
        text_features = self.clip_model.get_text_features(**text)
        combined = torch.cat((image_features, text_features), dim=1)
        return self.decoder(combined)

model = VisionLanguageModel()


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import os
import json
from PIL import Image
from transformers import BertTokenizer
from torch.utils.data import Dataset
import torchvision.transforms as transforms

class VLM_Dataset(Dataset):
    def __init__(self, data_path, annotations_path, max_samples=None):
        self.data_path = data_path

        # Verify data directory exists
        if not os.path.exists(data_path):
            raise RuntimeError(f"Data directory not found: {data_path}")

        # Load and process COCO annotations
        if not os.path.exists(annotations_path):
            raise RuntimeError(f"Annotations file not found: {annotations_path}")

        with open(annotations_path, 'r') as f:
            coco = json.load(f)

        # Create image_id to filename mapping
        self.image_to_file = {
            img['id']: img['file_name']
            for img in coco['images']
        }

        # Store annotations with proper image filenames
        self.annotations = []
        for ann in coco['annotations']:
            img_filename = self.image_to_file[ann['image_id']]
            img_path = os.path.join(data_path, img_filename)

            # Only add annotations for images that exist
            if os.path.exists(img_path):
                self.annotations.append({
                    'image': img_filename,
                    'caption': ann['caption']
                })

        if len(self.annotations) == 0:
            raise RuntimeError(
                f"No valid images found in {data_path}. "
                "Please download the COCO 2017 training images from "
                "https://cocodataset.org/#download"
            )

        # Optionally limit dataset size
        if max_samples is not None:
            self.annotations = self.annotations[:max_samples]

        print(f"Loaded dataset with {len(self.annotations)} valid images")

        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        img_filename = self.annotations[idx]['image']
        img_path = os.path.join(self.data_path, img_filename)

        # Load and process image
        try:
            image = Image.open(img_path).convert("RGB")
            image = self.transform(image)
        except Exception as e:
            print(f"Error loading image {img_path}: {str(e)}")
            raise

        # Process text
        text = self.annotations[idx]['caption']
        text_tokenized = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            max_length=50,
            truncation=True
        )

        # Remove the batch dimension that tokenizer adds
        text_tokenized = {k: v.squeeze(0) for k, v in text_tokenized.items()}

        return image, text_tokenized

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
#from models import VisionLanguageModel
#from prepare_data import VLM_Dataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
import time
from pathlib import Path

# Hyperparameters
epochs = 10
batch_size = 8
lr = 1e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create directories for checkpoints
Path("checkpoints").mkdir(exist_ok=True)

def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    start_time = time.time()

    for batch_idx, (images, texts) in enumerate(dataloader):
        # Move data to device
        images = images.to(device)
        texts = {k: v.to(device) for k, v in texts.items()}

        # Forward pass
        optimizer.zero_grad()
        outputs = model(images, texts)
        loss = criterion(outputs, texts["input_ids"][:, 0])

        # Backward pass
        loss.backward()
        optimizer.step()

        # Update statistics
        total_loss += loss.item()

        # Print progress every 10 batches
        if (batch_idx + 1) % 10 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            images_per_sec = (batch_idx + 1) * batch_size / (time.time() - start_time)
            print(f"Batch [{batch_idx + 1}/{len(dataloader)}] "
                  f"Loss: {avg_loss:.4f} "
                  f"Speed: {images_per_sec:.2f} images/sec")

    return total_loss / len(dataloader)

def main():
    # Initialize model and move to device
    print(f"Using device: {device}")
    model = VisionLanguageModel().to(device)

    # Load dataset
    dataset = VLM_Dataset(
        "data/images",
        "data/annotations/captions_val2017.json",
        max_samples=1000
    )
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True if torch.cuda.is_available() else False
    )

    # Initialize loss, optimizer, and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

    # Training loop
    best_loss = float('inf')
    for epoch in range(epochs):
        print(f"\nEpoch [{epoch + 1}/{epochs}]")

        # Train one epoch
        train_loss = train_epoch(model, dataloader, criterion, optimizer, device)

        # Update learning rate
        scheduler.step(train_loss)

        # Save checkpoint if best model
        if train_loss < best_loss:
            best_loss = train_loss
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': train_loss,
            }
            torch.save(checkpoint, 'checkpoints/best_model.pth')
            print(f"Saved new best model with loss: {train_loss:.4f}")

        print(f"Epoch {epoch + 1} Summary:")
        print(f"Average Loss: {train_loss:.4f}")
        print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")

if __name__ == "__main__":
    main()

Using device: cpu


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded dataset with 1000 valid images





Epoch [1/10]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
import torch
#from models import VisionLanguageModel
#from embedding import get_image_embedding, get_text_embedding
from transformers import BertTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model = VisionLanguageModel().to(device)
checkpoint_path = "/content/checkpoints/best_model.pth"
checkpoint = torch.load(checkpoint_path, map_location=device)

# Ensure only model state_dict is loaded
if "model_state_dict" in checkpoint:
    model.load_state_dict(checkpoint["model_state_dict"], strict=False)  # Allow missing keys
else:
    model.load_state_dict(checkpoint, strict=False)
model.eval()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def generate_caption(image_path):
    # Get image embedding using CLIP
    image_emb = get_image_embedding(image_path)
    image_emb = torch.tensor(image_emb).to(device)
    
    # Process text input
    text_input = processor(
        text=["Describe this image"],
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(device)
    
    with torch.no_grad():
        output = model(image_emb, text_input)
    
    # Convert output to text using processor
    return processor.decode(output.argmax(dim=-1)[0])


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load(checkpoint_path, map_location=device)


FileNotFoundError: [Errno 2] No such file or directory: '/content/checkpoints/best_model.pth'

In [None]:
print(generate_caption("/content/drive/MyDrive/example/football.jpg"))