# Deep Learning Project

## Image Caption Generation

### Introduction
Image Caption generation is a challenging problem in AI that connects computer vision and NLP where a textual description must be generated for a given photograph. In General Sense for a given image as input, our model describes the exact description of an Image. It requires both image understanding from the domain of computer vision which Convolution Neural Network and a language model from the field of Natural language processing.
It is important to assume and test multiple ways to frame a given predictive modeling problem and there are indeed many ways to frame the problem of generating captions for photographs. 

So Basically what our model does is when we pass an image to our CNN and RNN combined architecture then it will generate the natural description of the image using NLP.

In [2]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import argparse
import pickle
import os
from torchvision import transforms
from PIL import Image
import torch.nn.functional as F
torch.cuda.empty_cache()

# this file is located in pytorch tutorial/image
# captioning which we pull from git remember
import sys
sys.path.append('C:/Deep Learning/Projects/Code')  # Add once at the start

# Now import normally
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Function to Load and Resize the Image
def load_image(image_path, transform=None):
    image = Image.open(image_path)
    image = image.resize([224, 224], Image.LANCZOS)
    if transform is not None:
        image = transform(image).unsqueeze(0)
    return image

# Model paths
ENCODER_PATH = 'C:/Deep Learning/Projects/Code/Image Caption Generation/Model/encoder-5-3000.pkl'
DECODER_PATH = 'C:/Deep Learning/Projects/Code/Image Caption Generation/Model/decoder-5-3000.pkl'
VOCAB_PATH = 'C:/Deep Learning/Projects/Code/Image Caption Generation/vocab.pkl'

# Constants
EMBED_SIZE = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 1

In [4]:
def beam_search(decoder, features, vocab, beam_width=5, max_len=20):
    """
    Implements beam search for better caption generation.
    
    Args:
        decoder: DecoderRNN model
        features: Encoded image features from CNN
        vocab: Vocabulary wrapper
        beam_width: Number of beams to keep track of
        max_len: Maximum length of the caption
        
    Returns:
        best_caption: The caption with highest probability
    """
    # Move tensors to the same device as the model
    device = next(decoder.parameters()).device
    
    k = beam_width
    
    # Initialize the first word as <start> for all k beams
    start_token = vocab.word2idx['<start>']
    end_token = vocab.word2idx['<end>']
    
    # Initialize sequence scores
    top_k_scores = torch.zeros(k, 1).to(device)
    # Generate k starting sequences with just the start token
    seqs = torch.full((k, 1), start_token, dtype=torch.long).to(device)
    
    # Lists to store completed sequences and their scores
    complete_seqs = []
    complete_seqs_scores = []
    
    # Initialize hidden and cell states
    states = None
    
    # Start beam search
    step = 1
    while True:
        if step == 1:
            # For the first step, we only have one sequence (the start token)
            # Forward pass through decoder
            embedded = decoder.embed(seqs[0].unsqueeze(0))
            if states is None:
                outputs, states = decoder.lstm(embedded, None)
            else:
                outputs, states = decoder.lstm(embedded, states)
            outputs = decoder.fc(outputs.squeeze(1))
            
            # Get top k words
            scores = F.log_softmax(outputs, dim=1)
            top_k_scores, top_k_words = scores[0].topk(k, dim=0)
            
            # Convert to tensors
            top_k_scores = top_k_scores.unsqueeze(1)
            
            # Create k sequences, each with the start token and one top word
            seqs = torch.full((k, 2), start_token, dtype=torch.long).to(device)
            seqs[:, 1] = top_k_words
            
            # Ensure states are properly expanded for k beams
            if isinstance(states, tuple):  # LSTM states
                # Each state is (h, c) where h and c are (num_layers, batch_size, hidden_size)
                h, c = states
                states = (h.expand(-1, k, -1).contiguous(), 
                            c.expand(-1, k, -1).contiguous())
            else:  # GRU states
                states = states.expand(-1, k, -1).contiguous()
        else:
            # Subsequent steps
            curr_input = seqs[:, -1]
            embedded = decoder.embed(curr_input)
            if isinstance(states, tuple):  # LSTM
                h, c = states
                batch_size = h.size(1)
            else:  # GRU
                batch_size = states.size(1)
            
            embedded = embedded.view(1, batch_size, -1)
            outputs, states = decoder.lstm(embedded, states)
            outputs = decoder.fc(outputs.squeeze(1))
            
            # Get scores for next words
            scores = F.log_softmax(outputs, dim=1)
            
            # Add the log probabilities to current scores
            scores = top_k_scores.expand_as(scores) + scores
            
            # For the first batch, we consider all k possibilities
            # For subsequent batches, we consider k^2 possibilities
            if step == 2:
                top_k_scores, top_k_words = scores[0].topk(k, dim=0)
            else:
                # Flatten all scores
                top_k_scores, top_k_words = scores.view(-1).topk(k, dim=0)
                
                # Get the beam indices and word indices
                prev_word_inds = top_k_words // scores.size(1)  # beam index
                next_word_inds = top_k_words % scores.size(1)   # word index
                
                # Update sequences
                new_seqs = []
                for i, w in enumerate(prev_word_inds):
                    new_seqs.append(torch.cat([seqs[w], next_word_inds[i].unsqueeze(0)], dim=0))
                
                seqs = torch.stack(new_seqs)
                
                # Update states based on beam indices
                if isinstance(states, tuple):  # LSTM
                    h, c = states
                    h = h.view(-1, batch_size, h.size(2))
                    c = c.view(-1, batch_size, c.size(2))
                    states = (h[:, prev_word_inds], c[:, prev_word_inds])
                else:  # GRU
                    states = states.view(-1, batch_size, states.size(2))
                    states = states[:, prev_word_inds]
        
        # Check for completed sequences
        is_end = seqs[:, -1] == end_token
        if step > 1:
            # Add complete sequences to our results
            for i in range(len(is_end)):
                if is_end[i]:
                    complete_seqs.append(seqs[i].tolist())
                    complete_seqs_scores.append(top_k_scores[i])
            
            # If all k beams are finished, exit
            k -= sum(is_end).item()
            if k == 0:
                break
                
            # Remove completed beams
            seqs = seqs[~is_end]
            top_k_scores = top_k_scores[~is_end]
            if isinstance(states, tuple):
                h, c = states
                states = (h[:, ~is_end], c[:, ~is_end])
            else:
                states = states[:, ~is_end]
        
        # Stop if we've reached the maximum length
        if step >= max_len:
            break
        
        # Increment step
        step += 1
    
    # If we have completed sequences, return the best one
    if complete_seqs:
        best_seq_idx = complete_seqs_scores.index(max(complete_seqs_scores))
        best_seq = complete_seqs[best_seq_idx]
    else:
        # If no sequence is complete, return the current best one
        best_seq_idx = top_k_scores.argmax().item()
        best_seq = seqs[best_seq_idx].tolist()
    
    # Convert sequence of indices to words
    words = []
    for idx in best_seq:
        # Skip start and pad tokens in the output
        if idx != vocab.word2idx['<start>'] and idx != vocab.word2idx['<pad>']:
            if idx == vocab.word2idx['<end>']:
                break
            words.append(vocab.idx2word[idx])
    
    return ' '.join(words)

In [5]:
# Modified PretrainedResNet function to use beam search
def PretrainedResNetBeamSearch(image_path, encoder_path=ENCODER_PATH,
                    decoder_path=DECODER_PATH,
                    vocab_path=VOCAB_PATH,
                    embed_size=EMBED_SIZE,
                    hidden_size=HIDDEN_SIZE,
                    num_layers=NUM_LAYERS,
                    beam_width=5):
    
    # Import the torch.nn.functional module if not imported
    import torch.nn.functional as F
    
    # Image preprocessing
    transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.485, 0.456, 0.406),
                                    (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    
    # Pass device as string directly
    if device == 'cuda':
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    
    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))
    
    # Prepare an image
    image = load_image(image_path, transform)
    # Move image to device
    if device == 'cuda':
        image_tensor = image.cuda()
    else:
        image_tensor = image
    
    # Generate a caption from the image
    feature = encoder(image_tensor)
    
    # Use beam search instead of greedy decoding
    sentence = beam_search(decoder, feature, vocab, beam_width=beam_width)
    
    # Apply title case to the sentence
    sentence = sentence.title()
    
    # Return the generated caption and the image
    image = Image.open(image_path)
    return sentence, image


In [6]:
plt.figure(figsize=(12,12))
image_path = 'C:/Deep Learning/coco/images/train2014/COCO_train2014_000000001424.jpg'
# Use the beam search version with beam width of 5
predicted_label, image = PretrainedResNetBeamSearch(image_path=image_path, beam_width=5)
plt.imshow(image)
print(predicted_label)



AttributeError: 'DecoderRNN' object has no attribute 'fc'

<Figure size 1200x1200 with 0 Axes>