In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import models
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim.lr_scheduler import StepLR
from torchvision.datasets import Flickr30k
from torchvision import transforms
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

import numpy as np

In [2]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

In [3]:
from datasets import load_dataset

# Load Flickr30k from Hugging Face
dataset = load_dataset("clip-benchmark/wds_flickr30k")

# Check what it looks like
print(dataset)
print("\nFirst sample:")
print(dataset['test'][0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train/0.tar:   0%|          | 0.00/993M [00:00<?, ?B/s]

train/1.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

train/2.tar:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

train/3.tar:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

train/4.tar:   0%|          | 0.00/97.2M [00:00<?, ?B/s]

test/0.tar:   0%|          | 0.00/26.9M [00:00<?, ?B/s]

test/1.tar:   0%|          | 0.00/29.2M [00:00<?, ?B/s]

test/2.tar:   0%|          | 0.00/29.1M [00:00<?, ?B/s]

test/3.tar:   0%|          | 0.00/31.2M [00:00<?, ?B/s]

test/4.tar:   0%|          | 0.00/29.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['__key__', '__url__', 'jpg', 'txt'],
        num_rows: 29000
    })
    test: Dataset({
        features: ['__key__', '__url__', 'jpg', 'txt'],
        num_rows: 1000
    })
})

First sample:
{'__key__': 's0000000', '__url__': '/root/.cache/huggingface/hub/datasets--clip-benchmark--wds_flickr30k/snapshots/f8af3ceee1b944bf761340682621dfafd23974d2/test/0.tar', 'jpg': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x461 at 0x7BC8BE595EE0>, 'txt': 'The man with pierced ears is wearing glasses and an orange hat.\nA man with glasses is wearing a beer can crocheted hat.\nA man with gauges and glasses is wearing a Blitz hat.\nA man in an orange hat starring at something.\nA man wears an orange hat and glasses.'}


In [4]:
def collate_fn(batch):
    """Custom collate function to handle captions"""
    images = []
    captions = []

    for item in batch:
        img = transform(item['jpg'])
        images.append(img)

        # Take first caption and tokenize
        caption_text = item['txt'].split('\n')[0]
        tokenized = tokenize(caption_text, word2idx, max_length=20)
        captions.append(tokenized)

    # Convert to tensors
    images = torch.stack(images)
    captions = torch.LongTensor(captions)  # This converts list to tensor!

    return images, captions

In [5]:
train_loader = DataLoader(dataset['train'], batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(dataset['test'], batch_size=32, shuffle=False, collate_fn=collate_fn)

print(f"Train samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")

Train samples: 29000
Test samples: 1000


In [6]:
class AttentionModule(nn.Module):
    def __init__(self, feature_dim, hidden_dim, attention_dim):
        super(AttentionModule, self).__init__()

        # Linear layer to transform image features
        self.feature_attention = nn.Linear(feature_dim, attention_dim)

        # Linear layer to transform hidden state
        self.hidden_attention = nn.Linear(hidden_dim, attention_dim)

        # Final attention layer (outputs single score per location)
        self.attention_scores = nn.Linear(attention_dim, 1)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, features, hidden_state):
        batch_size = features.size(0)

        # Reshape features: (batch, 2048, 7, 7) -> (batch, 49, 2048)
        # 49 = 7*7 spatial locations
        features = features.view(batch_size, features.size(1), -1)  # (batch, 2048, 49)
        features = features.permute(0, 2, 1)  # (batch, 49, 2048)

        # Transform features: (batch, 49, 2048) -> (batch, 49, attention_dim)
        features_transformed = self.feature_attention(features)

        # Transform hidden state: (batch, hidden_dim) -> (batch, attention_dim)
        hidden_transformed = self.hidden_attention(hidden_state)

        # Expand hidden to match features: (batch, attention_dim) -> (batch, 49, attention_dim)
        hidden_transformed = hidden_transformed.unsqueeze(1).repeat(1, features.size(1), 1)

        # Combine and compute attention scores
        # (batch, 49, attention_dim) + (batch, 49, attention_dim) -> (batch, 49, attention_dim)
        combined = self.relu(features_transformed + hidden_transformed)

        # Get attention scores: (batch, 49, attention_dim) -> (batch, 49, 1)
        attention_scores = self.attention_scores(combined)

        # Apply softmax: (batch, 49, 1) -> (batch, 49, 1)
        attention_weights = self.softmax(attention_scores)

        # Weighted sum of features
        # (batch, 49, 1) * (batch, 49, 2048) -> (batch, 49, 2048)
        weighted_features = attention_weights * features

        # Sum across spatial locations: (batch, 49, 2048) -> (batch, 2048)
        context_vector = weighted_features.sum(dim=1)

        return context_vector, attention_weights

In [7]:
class LSTMWithAttention(nn.Module):
    def __init__(self, input_size, hidden_size, attention_module, attention_size):
        super(LSTMWithAttention, self).__init__()

        self.hidden_size = hidden_size
        self.attention = attention_module

        # LSTM gates - each takes: word_embedding + hidden_state + attention_context
        input_dim = input_size + hidden_size + attention_size

        # Input gate
        self.W_i = nn.Linear(input_dim, hidden_size)

        # Forget gate
        self.W_f = nn.Linear(input_dim, hidden_size)

        # Cell gate
        self.W_c = nn.Linear(input_dim, hidden_size)

        # Output gate
        self.W_o = nn.Linear(input_dim, hidden_size)

        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

    def forward(self, embeddings, features):
        batch_size = embeddings.size(0)
        seq_len = embeddings.size(1)

        # Initialize hidden state and cell state
        h_t = torch.zeros(batch_size, self.hidden_size).to(embeddings.device)
        c_t = torch.zeros(batch_size, self.hidden_size).to(embeddings.device)

        outputs = []

        # Process sequence word by word
        for t in range(seq_len):
            # Get current word embedding: (batch, embed_dim)
            x_t = embeddings[:, t, :]

            # Get attention context based on current hidden state
            # context: (batch, 2048)
            context, attention_weights = self.attention(features, h_t)

            # Concatenate: word + hidden + attention context
            # (batch, embed_dim + hidden_size + 2048)
            lstm_input = torch.cat([x_t, h_t, context], dim=1)

            # Compute gates (following LSTM equations from the document)
            i_t = self.sigmoid(self.W_i(lstm_input))  # Input gate
            f_t = self.sigmoid(self.W_f(lstm_input))  # Forget gate
            o_t = self.sigmoid(self.W_o(lstm_input))  # Output gate

            # Compute candidate cell state
            c_tilde = self.tanh(self.W_c(lstm_input))

            # Update cell state
            c_t = f_t * c_t + i_t * c_tilde

            # Update hidden state
            h_t = o_t * self.tanh(c_t)

            outputs.append(h_t.unsqueeze(1))

        # Stack all outputs: list of (batch, 1, hidden) -> (batch, seq_len, hidden)
        outputs = torch.cat(outputs, dim=1)

        return outputs

In [8]:
class ImageCaptioningModel(nn.Module):
    def __init__(self, embedding_layer, vocab_size, embed_dim=300, lstm_hidden=512, attention_dim=512):
        super(ImageCaptioningModel, self).__init__()

        # ResNet
        resnet = models.resnet50(pretrained=True)
        self.cnn = nn.Sequential(*list(resnet.children())[:-2])

        for param in self.cnn.parameters():
            param.requires_grad = False

        self.embedding = embedding_layer

        # Attention
        self.attention = AttentionModule(
            feature_dim=2048,  # ResNet50 output channels
            hidden_dim=lstm_hidden,
            attention_dim=attention_dim
        )

        # 4. LSTM with attention
        self.lstm = LSTMWithAttention(
            input_size=embed_dim,
            hidden_size=lstm_hidden,
            attention_module=self.attention,
            attention_size=2048  # ResNet features size
        )

        # 5. Output layer
        self.fc = nn.Linear(lstm_hidden, vocab_size)

    def forward(self, images, captions):
        features = self.cnn(images)  # Shape: (batch, 2048, 7, 7)
        embeddings = self.embedding(captions)  # Shape: (batch, seq_len, embed_dim)
        lstm_out = self.lstm(embeddings, features)

        outputs = self.fc(lstm_out)

        return outputs

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [10]:
from collections import Counter

# ===== 1. BUILD VOCABULARY FROM HUGGING FACE DATASET =====
print("Building vocabulary from captions...")
word_counter = Counter()

for item in dataset['train']:
    captions = item['txt'].split('\n')
    for caption in captions:
        words = caption.lower().split()
        word_counter.update(words)

# Create vocabulary (most common words)
vocab_size = 10000  # You can adjust this
most_common = word_counter.most_common(vocab_size - 2)  # -2 for special tokens

# Build word mappings
word2idx = {'<PAD>': 0, '<START>': 1}  # Special tokens
idx2word = ['<PAD>', '<START>']

for word, _ in most_common:
    idx = len(word2idx)
    word2idx[word] = idx
    idx2word.append(word)

vocab_size = len(word2idx)
print(f"Vocabulary size: {vocab_size}")

# ===== 2. CREATE SIMPLE EMBEDDING LAYER (NO WORD2VEC NEEDED) =====
embed_dim = 300
embedding_layer = nn.Embedding(vocab_size, embed_dim)

print(f"Embedding layer created with {vocab_size} words, {embed_dim} dimensions")

# ===== 3. TOKENIZATION FUNCTIONS =====
def tokenize(sentence, word2idx, max_length=20):
    """Convert sentence to token IDs"""
    tokens = [word2idx['<START>']]

    for word in sentence.lower().split():
        if word in word2idx:
            tokens.append(word2idx[word])
        # Skip unknown words

    # Pad or truncate
    if len(tokens) > max_length:
        tokens = tokens[:max_length]
    else:
        tokens = tokens + [word2idx['<PAD>']] * (max_length - len(tokens))

    return tokens

def untokenize(tokens, idx2word):
    """Convert token IDs back to words"""
    words = []
    for token in tokens:
        if token < len(idx2word) and idx2word[token] not in ['<PAD>', '<START>']:
            words.append(idx2word[token])
    return words


Building vocabulary from captions...
Vocabulary size: 10000
Embedding layer created with 10000 words, 300 dimensions


In [11]:
model = ImageCaptioningModel(
    embedding_layer=embedding_layer,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    lstm_hidden=512,
    attention_dim=512
)



Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:03<00:00, 33.1MB/s]


In [12]:
num_epochs = 10
model = model.to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding token
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=100, gamma=0.1)

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [17]:
def generate_caption(model, image, word2idx, idx2word, max_length=20):
    """Generate caption for a single image"""
    model.eval()

    # Start with START token
    caption = [word2idx['<START>']]

    for _ in range(max_length - 1):
        caption_tensor = torch.LongTensor([caption]).to(image.device)

        with torch.no_grad():
            outputs = model(image, caption_tensor)

        # Get prediction for last word (highest probability)
        prediction = outputs[0, -1, :].argmax().item()
        caption.append(prediction)

        # Stop if PAD token
        if prediction == word2idx['<PAD>']:
            break

    return caption

In [None]:
for epoch in range(num_epochs):
    # ===== TRAINING =====
    model.train()
    train_loss = 0
    train_steps = 0

    for images, captions in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        images = images.to(device)
        captions = captions.to(device)

        optimizer.zero_grad()

        # Forward pass: predict next words given previous words
        outputs = model(images, captions[:, :-1])  # Input: all except last word

        # Reshape for loss calculation
        outputs = outputs.reshape(-1, outputs.size(-1))  # (batch*seq_len, vocab_size)
        targets = captions[:, 1:].reshape(-1)  # Target: all except first word

        loss = criterion(outputs, targets)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)

        optimizer.step()

        train_loss += loss.item() * images.size(0)
        train_steps += images.size(0)

    avg_train_loss = train_loss / train_steps

    # ===== VALIDATION =====
    model.eval()
    val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for images, captions in tqdm(test_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            images = images.to(device)
            captions = captions.to(device)

            outputs = model(images, captions[:, :-1])
            outputs = outputs.reshape(-1, outputs.size(-1))
            targets = captions[:, 1:].reshape(-1)

            loss = criterion(outputs, targets)
            val_loss += loss.item() * images.size(0)
            val_steps += images.size(0)

    avg_val_loss = val_loss / val_steps

    # ===== GENERATE SAMPLE CAPTION =====
    model.eval()
    with torch.no_grad():
        # Take first image from test set
        sample_item = dataset['test'][0]
        sample_image = transform(sample_item['jpg']).unsqueeze(0).to(device)
        actual_caption = sample_item['txt'].split('\n')[0]

        # Generate caption
        generated_tokens = generate_caption(model, sample_image, word2idx, idx2word, max_length=20)
        generated_caption = ' '.join(untokenize(generated_tokens, idx2word))

        print(f"\n{'='*60}")
        print(f"Sample Caption Generation:")
        print(f"Actual   : {actual_caption}")
        print(f"Generated: {generated_caption}")
        print(f"{'='*60}")

    # ===== SCHEDULER STEP =====
    scheduler.step()
    current_lr = optimizer.param_groups[0]['lr']

    # ===== PRINT EPOCH RESULTS =====
    print(f"\nEpoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {avg_train_loss:.4f}")
    print(f"Val   Loss: {avg_val_loss:.4f}")
    print(f"Learning Rate: {current_lr:.6f}")
    print(f"Perplexity: {np.exp(avg_val_loss):.2f}")  # Common metric for language models
    print('-'*60)

print("Training complete!")

Epoch 1/10 - Training:   0%|          | 0/907 [00:00<?, ?it/s]

Epoch 1/10 - Validation:   0%|          | 0/32 [00:00<?, ?it/s]


Sample Caption Generation:
Actual   : The man with pierced ears is wearing glasses and an orange hat.
Generated: a man with a black shirt and a black hat is holding a cigarette in the background. with a

Epoch [1/10]
Train Loss: 3.8047
Val   Loss: 3.7548
Learning Rate: 0.001000
Perplexity: 42.73
------------------------------------------------------------


Epoch 2/10 - Training:   0%|          | 0/907 [00:00<?, ?it/s]

Epoch 2/10 - Validation:   0%|          | 0/32 [00:00<?, ?it/s]


Sample Caption Generation:
Actual   : The man with pierced ears is wearing glasses and an orange hat.
Generated: a man wearing a black shirt and a woman wearing a black shirt and a woman in a black

Epoch [2/10]
Train Loss: 3.4678
Val   Loss: 3.6531
Learning Rate: 0.001000
Perplexity: 38.59
------------------------------------------------------------


Epoch 3/10 - Training:   0%|          | 0/907 [00:00<?, ?it/s]

Epoch 3/10 - Validation:   0%|          | 0/32 [00:00<?, ?it/s]


Sample Caption Generation:
Actual   : The man with pierced ears is wearing glasses and an orange hat.
Generated: a man with a black hat and a hat is standing in front of a microphone with a man

Epoch [3/10]
Train Loss: 3.2146
Val   Loss: 3.5968
Learning Rate: 0.001000
Perplexity: 36.48
------------------------------------------------------------


Epoch 4/10 - Training:   0%|          | 0/907 [00:00<?, ?it/s]

Epoch 4/10 - Validation:   0%|          | 0/32 [00:00<?, ?it/s]


Sample Caption Generation:
Actual   : The man with pierced ears is wearing glasses and an orange hat.
Generated: a man with a beard and a hat is holding a drink and a woman in a black jacket

Epoch [4/10]
Train Loss: 2.9999
Val   Loss: 3.5957
Learning Rate: 0.001000
Perplexity: 36.44
------------------------------------------------------------


Epoch 5/10 - Training:   0%|          | 0/907 [00:00<?, ?it/s]

Epoch 5/10 - Validation:   0%|          | 0/32 [00:00<?, ?it/s]


Sample Caption Generation:
Actual   : The man with pierced ears is wearing glasses and an orange hat.
Generated: a man with a black shirt and black hat is holding a bottle of paper in a mirror." hat.

Epoch [5/10]
Train Loss: 2.8110
Val   Loss: 3.6046
Learning Rate: 0.001000
Perplexity: 36.77
------------------------------------------------------------


Epoch 6/10 - Training:   0%|          | 0/907 [00:00<?, ?it/s]

Epoch 6/10 - Validation:   0%|          | 0/32 [00:00<?, ?it/s]


Sample Caption Generation:
Actual   : The man with pierced ears is wearing glasses and an orange hat.
Generated: a man with a beard and a black shirt with a beard and a woman with a black hat

Epoch [6/10]
Train Loss: 2.6440
Val   Loss: 3.6457
Learning Rate: 0.001000
Perplexity: 38.31
------------------------------------------------------------


Epoch 7/10 - Training:   0%|          | 0/907 [00:00<?, ?it/s]

Epoch 7/10 - Validation:   0%|          | 0/32 [00:00<?, ?it/s]


Sample Caption Generation:
Actual   : The man with pierced ears is wearing glasses and an orange hat.
Generated: a man with a beard and a gray shirt is holding a bottle of paper in a dimly lit

Epoch [7/10]
Train Loss: 2.4967
Val   Loss: 3.6918
Learning Rate: 0.001000
Perplexity: 40.12
------------------------------------------------------------


Epoch 8/10 - Training:   0%|          | 0/907 [00:00<?, ?it/s]

In [None]:
#training took too long due to the model complexity