In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, then print
# one full path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cnn-lstm-for-rsicd/pytorch/default/1/best_attention_model.pth


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from collections import Counter, namedtuple
import json
import numpy as np
from nltk.translate.bleu_score import corpus_bleu
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
from wordcloud import WordCloud
import textwrap
import torch.nn.functional as F
import re
from torchvision import transforms
import requests
import io
import textwrap

In [7]:
class Config:
    """Central place for the notebook's hyperparameters and runtime settings.

    All values are plain class attributes and are read as ``Config.<name>``.
    """

    # --- Caption / model dimensions -------------------------------------
    # max_caption_length bounds the number of greedy decoding steps;
    # embed_size, hidden_size and attention_size size the decoder.
    max_caption_length = 50
    embed_size = 300
    attention_size = 512
    hidden_size = 512
    num_layers = 1
    dropout = 0.3

    # --- Training settings ----------------------------------------------
    # NOTE(review): none of these are referenced by the inference cells
    # visible in this notebook; presumably they are used by training code
    # that lives elsewhere.
    finetune_encoder = True
    batch_size = 64
    num_epochs = 40
    learning_rate = 1e-4
    lr_decay_patience = 3
    grad_clip = 5.0
    vocab_threshold = 5
    teacher_forcing_start = 1.0
    teacher_forcing_end = 0.5

    # --- Runtime ----------------------------------------------------------
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
class EncoderCNN(nn.Module):
    """ResNet-50 backbone that maps images to a sequence of feature vectors.

    Args:
        finetune: When True, the parameters of the backbone children from
            index 6 onward are left trainable; everything else is frozen.
            When False, the whole backbone is frozen.

    Attributes:
        resnet: ``nn.Sequential`` holding the backbone without its last two
            child modules.
        feature_dim: Channel count of the backbone output, read from the
            input width of the original ``fc`` layer.
    """

    def __init__(self, finetune=False):
        super().__init__()
        backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

        # Keep everything except the final two children (presumably the
        # pooling layer and the classifier head of torchvision's ResNet).
        kept_children = list(backbone.children())[:-2]
        self.resnet = nn.Sequential(*kept_children)
        self.feature_dim = backbone.fc.in_features

        if finetune:
            print("Encoder finetuning enabled.")

        # Single pass over the children: a child is trainable only when
        # finetuning is requested and it sits at position 6 or later.
        for position, child in enumerate(kept_children):
            trainable = bool(finetune) and position >= 6
            for weight in child.parameters():
                weight.requires_grad = trainable

    def forward(self, images):
        """Encode ``images`` and return features shaped (batch, locations, feature_dim)."""
        feature_map = self.resnet(images)
        batch = feature_map.size(0)
        # Channels-last, then flatten the two spatial axes into one.
        return feature_map.permute(0, 2, 3, 1).view(batch, -1, self.feature_dim)

class Attention(nn.Module):
    """Additive attention that scores encoder locations against a decoder state.

    Args:
        encoder_dim: Feature size of each encoder location.
        decoder_dim: Size of the decoder hidden state.
        attention_dim: Size of the shared space both inputs are projected to.
    """

    def __init__(self, encoder_dim, decoder_dim, attention_dim):
        super().__init__()
        # Projections of both inputs into the shared attention space.
        self.encoder_att = nn.Linear(encoder_dim, attention_dim)
        self.decoder_att = nn.Linear(decoder_dim, attention_dim)
        # Reduces each location's combined projection to a single score.
        self.full_att = nn.Linear(attention_dim, 1)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, encoder_out, decoder_hidden):
        """Return ``(context, alpha)`` for one decoding step.

        Args:
            encoder_out: Encoder features, (batch, locations, encoder_dim).
            decoder_hidden: Decoder hidden state, (batch, decoder_dim).

        Returns:
            context: Sum of the encoder features weighted by ``alpha``,
                (batch, encoder_dim).
            alpha: Softmax-normalised weights over locations, (batch, locations).
        """
        encoder_proj = self.encoder_att(encoder_out)
        # Insert a location axis so the decoder projection broadcasts.
        decoder_proj = self.decoder_att(decoder_hidden).unsqueeze(1)
        combined = self.relu(encoder_proj + decoder_proj)
        scores = self.full_att(combined).squeeze(2)
        alpha = self.softmax(scores)
        weighted = encoder_out * alpha.unsqueeze(2)
        return weighted.sum(dim=1), alpha

class DecoderLSTMWithAttention(nn.Module):
    """LSTM-cell caption decoder that attends over encoder features each step.

    Args:
        embed_size: Word-embedding dimension.
        hidden_size: LSTM hidden/cell state dimension.
        vocab_size: Number of tokens in the vocabulary (size of the output layer).
        encoder_dim: Feature dimension of each encoder location.
        attention_size: Dimension of the attention projection space.
        pretrained_embeddings: Optional float tensor of shape
            (vocab_size, embed_size). When given, it becomes the (frozen)
            embedding table.

    Raises:
        ValueError: If ``pretrained_embeddings`` does not have shape
            (vocab_size, embed_size).
    """

    def __init__(self, embed_size, hidden_size, vocab_size, encoder_dim, attention_size, pretrained_embeddings=None):
        super().__init__()
        self.encoder_dim = encoder_dim

        if pretrained_embeddings is None:
            self.embed = nn.Embedding(vocab_size, embed_size)
        else:
            if tuple(pretrained_embeddings.shape) != (vocab_size, embed_size):
                raise ValueError(
                    "pretrained_embeddings must have shape "
                    f"({vocab_size}, {embed_size}), got {tuple(pretrained_embeddings.shape)}"
                )
            # `from_pretrained` is a classmethod that RETURNS a new layer; the
            # result has to be assigned, otherwise the pretrained weights are
            # silently discarded and a randomly initialised table is used.
            self.embed = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)

        self.attention = Attention(encoder_dim, hidden_size, attention_size)
        # Each step consumes the word embedding concatenated with the context vector.
        self.lstm = nn.LSTMCell(embed_size + encoder_dim, hidden_size)
        # The initial hidden/cell states are derived from the mean encoder feature.
        self.init_h = nn.Linear(encoder_dim, hidden_size)
        self.init_c = nn.Linear(encoder_dim, hidden_size)
        # Produces a sigmoid gate that scales the context vector.
        self.f_beta = nn.Linear(hidden_size, encoder_dim)
        self.sigmoid = nn.Sigmoid()
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, encoder_out, encoded_captions, caption_lengths):
        """Decode a batch of captions, feeding the given caption tokens as inputs.

        The batch rows must be ordered by decreasing caption length: at step
        ``t`` only the first ``batch_size_t`` rows (those whose decode length
        exceeds ``t``) are advanced.

        Args:
            encoder_out: Encoder features; viewed as (batch, locations, encoder_dim).
            encoded_captions: Token indices, (batch, sequence).
            caption_lengths: Iterable with one caption length per batch row.

        Returns:
            predictions: Vocabulary scores, (batch, max decode length, vocab_size);
                steps past a row's decode length stay zero.
            encoded_captions: The input captions, returned unchanged.
            decode_lengths: ``caption_lengths`` with 1 subtracted from each entry.
            alphas: Attention weights, (batch, max decode length, locations).
        """
        batch_size = encoder_out.size(0)
        vocab_size = self.fc.out_features
        # Allocate outputs next to the inputs instead of on a global device, so
        # the module works wherever the caller placed the tensors.
        device = encoder_out.device

        encoder_out_flat = encoder_out.view(batch_size, -1, self.encoder_dim)
        num_locations = encoder_out_flat.size(1)
        embeddings = self.embed(encoded_captions)

        mean_encoder_out = encoder_out_flat.mean(dim=1)
        h = self.init_h(mean_encoder_out)
        c = self.init_c(mean_encoder_out)

        # One fewer decoding step than tokens: the last token is never an input.
        decode_lengths = [length - 1 for length in caption_lengths]
        max_steps = max(decode_lengths)
        predictions = torch.zeros(batch_size, max_steps, vocab_size, device=device)
        alphas = torch.zeros(batch_size, max_steps, num_locations, device=device)

        for t in range(max_steps):
            # Number of sequences that still have a token to predict at step t.
            batch_size_t = sum(length > t for length in decode_lengths)
            h_t = h[:batch_size_t]
            c_t = c[:batch_size_t]

            attention_weighted_encoding, alpha = self.attention(encoder_out_flat[:batch_size_t], h_t)
            gate = self.sigmoid(self.f_beta(h_t))
            attention_weighted_encoding = gate * attention_weighted_encoding

            lstm_input = torch.cat([embeddings[:batch_size_t, t, :], attention_weighted_encoding], dim=1)
            h, c = self.lstm(lstm_input, (h_t, c_t))

            predictions[:batch_size_t, t, :] = self.fc(h)
            alphas[:batch_size_t, t, :] = alpha

        return predictions, encoded_captions, decode_lengths, alphas

In [5]:
def generate_caption_with_attention(encoder, decoder, img_tensor, vocab, max_length=None):
    """Greedily decode a caption for a single image.

    Args:
        encoder: Callable mapping ``img_tensor`` to encoder features.
        decoder: Object exposing ``init_h``, ``init_c``, ``embed``,
            ``attention``, ``f_beta``, ``sigmoid``, ``lstm`` and ``fc``.
        img_tensor: Preprocessed image batch containing exactly one image
            (the features are viewed with a batch size of 1).
        vocab: Object with ``word2idx`` / ``idx2word`` mappings that contain
            the ``<START>``, ``<END>`` and ``<UNK>`` tokens.
        max_length: Maximum number of decoding steps. Defaults to
            ``Config.max_caption_length`` when None.

    Returns:
        The generated words joined by single spaces. Decoding stops before
        ``<END>`` is emitted or after ``max_length`` steps.
    """
    if max_length is None:
        max_length = Config.max_caption_length

    start_idx = vocab.word2idx['<START>']
    end_idx = vocab.word2idx['<END>']
    unk_word = vocab.idx2word[vocab.word2idx['<UNK>']]

    seq = []
    with torch.no_grad():
        encoder_out = encoder(img_tensor)
        encoder_dim = encoder_out.size(-1)
        encoder_out = encoder_out.view(1, -1, encoder_dim)

        mean_encoder_out = encoder_out.mean(dim=1)
        h = decoder.init_h(mean_encoder_out)
        c = decoder.init_c(mean_encoder_out)

        # Place the start token on the same device as the features rather than
        # on a global device, so the function follows wherever the model lives.
        prev_word = torch.LongTensor([start_idx]).to(encoder_out.device)

        for _step in range(max_length):
            embeddings = decoder.embed(prev_word)
            awe, _alpha = decoder.attention(encoder_out, h)
            gate = decoder.sigmoid(decoder.f_beta(h))
            awe = gate * awe
            h, c = decoder.lstm(torch.cat([embeddings, awe], dim=1), (h, c))
            scores = decoder.fc(h)
            _best_score, next_word_idx = torch.max(scores, dim=1)

            # Read the index once; each .item() call copies from the device.
            token = next_word_idx.item()
            if token == end_idx:
                break

            seq.append(vocab.idx2word.get(token, unk_word))
            prev_word = next_word_idx

    return ' '.join(seq)
    
def show_qualitative_results_attention(encoder, decoder, vocab, img_list, img_dir, captions_dict, num_samples=5):
    """Show randomly chosen images with their generated and reference captions.

    Args:
        encoder: Trained encoder passed through to the caption generator.
        decoder: Trained decoder passed through to the caption generator.
        vocab: Vocabulary object passed through to the caption generator.
        img_list: Sequence of image file names to sample from.
        img_dir: Directory that contains the files named in ``img_list``.
        captions_dict: Mapping from image file name to its reference captions.
        num_samples: Number of images to display; capped at ``len(img_list)``
            so that requesting more images than exist does not raise.
    """
    num_samples = min(num_samples, len(img_list))
    print(f"\nDisplaying qualitative results for {num_samples} random images...")
    random_images = random.sample(img_list, num_samples)

    # Build the preprocessing pipeline once instead of once per image.
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)), transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    for img_name in random_images:
        img_path = os.path.join(img_dir, img_name)
        # Force three channels so grayscale/RGBA files match the 3-channel
        # normalisation (same handling as predict_caption), and close the
        # underlying file as soon as the pixels are loaded.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        img_tensor = preprocess(img).unsqueeze(0).to(Config.device)

        generated_caption = generate_caption_with_attention(encoder, decoder, img_tensor, vocab)

        plt.figure(figsize=(8, 8))
        plt.imshow(img)
        plt.axis('off')
        plt.show()

        print("-" * 80)
        print(f"IMAGE: {img_name}")
        print(f"GENERATED CAPTION:\n  {generated_caption}\n")

        reference_captions = captions_dict[img_name]
        print("REFERENCE CAPTIONS:")
        for ref_cap in reference_captions:
            print(f"  - {ref_cap}")
        print("-" * 80)
        print("\n")

In [9]:
def clean_caption(caption):
    """Normalise a caption: lowercase it, then strip punctuation, digits and extra whitespace."""
    # (pattern, replacement) pairs, applied strictly in this order:
    # drop non-word/non-space characters, drop digit runs, collapse whitespace.
    substitutions = (
        (r'[^\w\s]', ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    cleaned = caption.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

class Vocabulary:
    """Bidirectional word/index mapping with four reserved special tokens.

    Attributes:
        word2idx: Maps a token to its integer index.
        idx2word: Maps an integer index back to its token.
        idx: Next free index to assign.
    """

    def __init__(self):
        self.word2idx = {"<PAD>": 0, "<START>": 1, "<END>": 2, "<UNK>": 3}
        self.idx2word = {0: "<PAD>", 1: "<START>", 2: "<END>", 3: "<UNK>"}
        self.idx = 4

    def build_vocab(self, captions, threshold=5):
        """Add every word occurring at least ``threshold`` times in ``captions``.

        Words that already have an index (including the reserved tokens) are
        left untouched, so calling this more than once, or passing captions
        that contain a reserved token, keeps ``word2idx``, ``idx2word`` and
        ``len(self)`` consistent with each other.

        Args:
            captions: Iterable of caption strings; tokens are split on whitespace.
            threshold: Minimum total count a word needs to be added.
        """
        counter = Counter()
        for caption in captions:
            counter.update(caption.split())

        for word, cnt in counter.items():
            if cnt < threshold or word in self.word2idx:
                continue
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        """Return the number of tokens, reserved tokens included."""
        return len(self.word2idx)

    def encode(self, text):
        """Return the index of each whitespace-separated token; unknown tokens map to ``<UNK>``."""
        unk_idx = self.word2idx["<UNK>"]
        return [self.word2idx.get(token, unk_idx) for token in text.split()]

In [10]:
def predict_caption(encoder, decoder, vocab, image_path_or_url, timeout=10):
    """
    Loads an image from a path or URL, generates a caption, and displays the result.

    Errors are reported with a printed message instead of being raised, so a
    failed download or missing file does not stop the notebook.

    Args:
        encoder (nn.Module): The trained encoder model.
        decoder (nn.Module): The trained decoder model.
        vocab (Vocabulary): The vocabulary object from training.
        image_path_or_url (str): The local file path or web URL of the image.
        timeout (float): Seconds to wait for the HTTP server before giving up;
            only used when ``image_path_or_url`` is a URL.
    """
    try:
        # Only real http(s) URLs are downloaded; a local path that merely
        # starts with the letters "http" is treated as a file.
        if image_path_or_url.lower().startswith(('http://', 'https://')):
            # Without a timeout, requests can block indefinitely on a stalled server.
            response = requests.get(image_path_or_url, timeout=timeout)
            response.raise_for_status()
            with Image.open(io.BytesIO(response.content)) as raw_img:
                img = raw_img.convert("RGB")
        else:
            # The context manager releases the file handle once the pixels are loaded.
            with Image.open(image_path_or_url) as raw_img:
                img = raw_img.convert("RGB")

        transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        img_tensor = transform(img).unsqueeze(0).to(Config.device)

        generated_caption = generate_caption_with_attention(encoder, decoder, img_tensor, vocab)

        plt.figure(figsize=(8, 8))
        plt.imshow(img)
        plt.axis('off')
        plt.show()

        print("-" * 80)
        print(f"GENERATED CAPTION:\n  {generated_caption}")
        print("-" * 80)

    except requests.exceptions.RequestException as e:
        print(f"Error: Could not download image from URL. {e}")
    except FileNotFoundError:
        print(f"Error: Image file not found at path '{image_path_or_url}'")
    except Exception as e:
        # Deliberate best-effort catch-all: report the problem and carry on.
        print(f"An unexpected error occurred: {e}")





print("--- LOADING TRAINED MODEL FOR INFERENCE ---")
model_path = '/kaggle/input/cnn-lstm-for-rsicd/pytorch/default/1/best_attention_model.pth'

# NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python
# objects (the checkpoint stores the vocabulary object next to the weights),
# so only load checkpoint files from a trusted source.
checkpoint = torch.load(model_path, map_location=Config.device, weights_only=False)
vocab = checkpoint['vocab']

# Rebuild both networks with the configured sizes, then restore their weights
# and switch them to evaluation mode.
encoder = EncoderCNN(finetune=False).to(Config.device)
decoder = DecoderLSTMWithAttention(
    embed_size=Config.embed_size,
    hidden_size=Config.hidden_size,
    vocab_size=len(vocab),
    encoder_dim=encoder.feature_dim,
    attention_size=Config.attention_size,
).to(Config.device)

encoder.load_state_dict(checkpoint['encoder'])
encoder.eval()
decoder.load_state_dict(checkpoint['decoder'])
decoder.eval()
print("Model loaded successfully.\n")

--- LOADING TRAINED MODEL FOR INFERENCE ---


Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 213MB/s]


Model loaded successfully.



In [None]:
# Caption one local test image with the models loaded above.
predict_caption(
    encoder,
    decoder,
    vocab,
    image_path_or_url='/kaggle/input/testcheck/pytorch/default/1/Screenshot 2025-11-14 120719.png',
)