In [None]:
# !pip install nltk -q
# !pip install rouge -q
# !pip install git+https://github.com/ruotianluo/cider.git
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge


In [None]:
import os
import pickle
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from tqdm import tqdm
from transformers import T5Tokenizer, T5Model
from torch import nn
import torch.nn.functional as F


# Image backbone: ImageNet-pretrained ResNet50 with its final fully connected
# layer removed, so the network ends at the pooled feature output.
# `weights=...IMAGENET1K_V1` selects the same checkpoint the deprecated
# `pretrained=True` flag used to load.
base_model = torchvision.models.resnet50(
    weights=torchvision.models.ResNet50_Weights.IMAGENET1K_V1
)
base_model = nn.Sequential(*list(base_model.children())[:-1])  # Remove the last layer


# Text side: pretrained T5-small tokenizer and encoder-decoder model.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
t5_model = T5Model.from_pretrained('t5-small')


# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# `.to()` returns the module itself; binding the result keeps the cell from
# echoing the full T5 architecture as its output.
base_model = base_model.to(device)
t5_model = t5_model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5Model(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(p=0.1, inplace=

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and working files used below live under that path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json

# Read the RSICD annotation file.
with open('/content/drive/MyDrive/Remote_Sensing_Image_Analysis/dataset/RSICD/dataset_rsicd.json', 'r') as f:
    data = json.load(f)

# One header row, then one "<filename>,<caption>" row for each of the first
# five raw captions of every image (trailing whitespace stripped).
txt_data = ["images,sentences"]
txt_data.extend(
    f"{entry['filename']},{sent['raw'].rstrip()}"
    for entry in data['images']
    for sent in entry['sentences'][:5]
)

# Dump the rows to a text file in the current working directory.
txt_file = 'captions.txt'
with open(txt_file, 'w') as f:
    f.write('\n'.join(txt_data))

import pandas as pd

# Column names applied to the evaluation table: the training epoch count, the
# dataset name, then one column per caption metric.
columns = [
    'epoch', 'dataset',
    'Bleu1', 'Bleu2', 'Bleu3', 'Bleu4',
    'Meteor', 'Rouge', 'Cider',
]

# Load the stored evaluation results and overwrite whatever header the file
# carries with the names above.
df = pd.read_csv("/content/drive/MyDrive/Remote_Sensing_Image_Analysis/sys/evl.csv")
df.columns = columns

# Look up and print stored metrics for one (epoch, dataset) pair.
# NOTE(review): this name shadows Python's builtin `eval` for the rest of the notebook.
def eval(epochs, dataset):
    """Print the metric columns of every row of the global `df` that matches.

    Args:
        epochs: value compared for equality against the 'epoch' column.
        dataset: value compared for equality against the 'dataset' column.

    Prints the Bleu1-4, Meteor, Rouge and Cider columns as a plain-text table
    without the index. Prints nothing when no row matches. Returns None.
    """
    metric_cols = ['Bleu1', 'Bleu2', 'Bleu3', 'Bleu4', 'Meteor', 'Rouge', 'Cider']
    wanted = (df['epoch'] == epochs) & (df['dataset'] == dataset)
    rows = df.loc[wanted, metric_cols]
    if rows.empty:
        return
    print(rows.to_string(index=False))

In [None]:
# Project root on the mounted Drive; the features pickle and saved model live here.
WORKING_DIR = '/content/drive/MyDrive/Remote_Sensing_Image_Analysis'
# Name of the dataset this run works on.
dataset = "RSICD"
# Folder that holds that dataset's images and annotation file.
BASE_DIR = WORKING_DIR + '/dataset/' + dataset

In [None]:
import torchvision.models as models
from torchvision.models import ResNet50_Weights

# Set device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Load pre-trained ResNet50 model
base_model = models.resnet50(weights=ResNet50_Weights.DEFAULT)


# Feature extractor: every ResNet50 child except the final fully connected
# layer, so the output is the pooled feature rather than class scores.
model = nn.Sequential(*list(base_model.children())[:-1])


# Move the model to the device (GPU if available)
model.to(device)

# Inference mode: BatchNorm layers must use their stored running statistics
# when features are extracted, otherwise the features depend on the batch.
# Call model.train() explicitly if this network is ever fine-tuned.
model.eval()


# Loss and optimizer over the extractor's parameters.
# NOTE(review): nothing visible in this notebook trains the ResNet, and
# `optimizer` is rebound for the caption model further down — confirm whether
# these two are still needed.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
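# One-time only: extract ResNet50 features for every image. Kept commented out because the result is cached in the pickle loaded below.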


# from torchvision import models, transforms
# from PIL import Image
# from tqdm import tqdm
# import os

# # extract features from image
# features = {}
# directory = os.path.join(BASE_DIR,'RSICD_images')

# # Define the preprocessing transform
# transform = transforms.Compose([
#     transforms.Resize(224),
#     transforms.CenterCrop(224),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])

# for img_name in tqdm(os.listdir(directory)):
#     # load the image from file
#     img_path = os.path.join(directory, img_name)
#     image = Image.open(img_path).convert('RGB')

#     # Preprocess the image
#     image = transform(image)
#     image = image.unsqueeze(0)

#     # Move image to the same device as your model
#     image = image.to(device)

#     # Extract features (assuming your model is named 'model')
#     with torch.no_grad():
#         feature = model(image)

#     # Convert feature tensor to numpy array and remove batch dimension
#     features[img_name] = feature.cpu().numpy().squeeze(0)

In [None]:
# store features in pickle
# with open(os.path.join(WORKING_DIR, 'RSICD_RESNET50_features.pkl'), 'wb') as f:
#     pickle.dump(features, f)

In [None]:
# load features from pickle
# NOTE(review): unpickling can execute arbitrary code — only load a file this project produced itself.
# `features` is iterated with .items() further down, so it is expected to be a dict;
# presumably keyed by image id — TODO confirm the key format matches the ids used in `mapping`.
with open(os.path.join(WORKING_DIR, 'RSICD_RESNET50_features.pkl'), 'rb') as f:
    features = pickle.load(f)

In [None]:
# Read the caption file written earlier into one string, dropping the first
# (header) line.
with open('captions.txt', 'r') as f:
    next(f)  # discard the header row
    captions_doc = f.read()

In [None]:
# Create mapping of image id -> list of raw captions.
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    if len(line) < 2:
        continue
    # Split on the FIRST comma only: everything after it is the caption, so
    # captions that contain commas are kept verbatim instead of being split
    # and re-joined with extra spaces.
    filename, _, caption = line.partition(',')
    # The image id is the filename up to its first dot.
    image_id = filename.split('.')[0]
    mapping.setdefault(image_id, []).append(caption.strip())

# Convert features to PyTorch tensors. `as_tensor` passes existing tensors
# through unchanged, so re-running this cell does not re-copy them or emit
# copy-construct warnings.
features = {image_id: torch.as_tensor(feature) for image_id, feature in features.items()}

100%|██████████| 54605/54605 [00:00<00:00, 382869.69it/s]


In [None]:
# Number of distinct image ids that received at least one caption.
len(mapping)

10921

In [None]:
import re

def clean(mapping):
    """Normalize every caption in `mapping` in place.

    Args:
        mapping: dict whose values are lists of caption strings. The lists
            are modified in place.

    Each caption is lower-cased, stripped of every character that is not
    a-z or whitespace, reduced to its words of length > 1 joined by single
    spaces, and wrapped as 'startseq <caption> endseq'.

    Captions that already carry both tags are left untouched, so running the
    cell more than once does not stack extra 'startseq'/'endseq' tokens.

    Returns:
        None.
    """
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            # Already processed by an earlier call: skip to stay idempotent.
            if caption.startswith('startseq ') and caption.endswith(' endseq'):
                continue
            # Lower-case, then drop digits, punctuation and anything else
            # outside a-z / whitespace. A hyphenated word collapses into one
            # token, e.g. 'two-way' becomes 'twoway'.
            letters_only = re.sub(r'[^a-z\s]', '', caption.lower())
            # split() also collapses whitespace runs; single-letter words are dropped.
            words = [word for word in letters_only.split() if len(word) > 1]
            captions[i] = 'startseq ' + ' '.join(words) + ' endseq'

In [None]:
# Captions of one sample image before text preprocessing.
mapping['00005']

['a huge field with trees and plants surrounded while a couple of small playgrounds .',
 'an oval court and two-way street nearby .',
 'many buildings and some green trees are around three playgrounds in different sizes .',
 'a large surrounded is surroundededed by green trees and two small playgrounds is surroundededed by buildings .',
 'three playgrounds are surrounded by many trees and buildings .']

In [None]:
# Normalize all captions; `mapping` itself is modified and nothing is returned.
clean(mapping)

In [None]:
# after preprocess of text
mapping['00005']

['startseq huge field with trees and plants surrounded while couple of small playgrounds endseq',
 'startseq an oval court and twoway street nearby endseq',
 'startseq many buildings and some green trees are around three playgrounds in different sizes endseq',
 'startseq large surrounded is surroundededed by green trees and two small playgrounds is surroundededed by buildings endseq',
 'startseq three playgrounds are surrounded by many trees and buildings endseq']

In [None]:
from transformers import T5Tokenizer

# Pretrained T5-small tokenizer used to encode the captions below.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# Vocabulary size as reported by the tokenizer.
vocab_size = tokenizer.vocab_size

In [None]:
# Fixed token length every caption is padded/truncated to when encoded.
max_length = 128  # or any other suitable value
# NOTE(review): this sets an ad-hoc attribute on the tokenizer object; the encode calls
# below pass max_length explicitly, so this line presumably has no effect — confirm.
tokenizer.max_length = max_length

In [None]:
# 90/10 train/test split over image ids.
image_ids = list(mapping.keys())
train_size = int(len(image_ids) * 0.90)

# A seeded generator makes the split identical on every run; without it, a
# restarted session can place former training images in the test set.
split_rng = torch.Generator().manual_seed(42)
train_ids, test_ids = torch.utils.data.random_split(
    image_ids,
    [train_size, len(image_ids) - train_size],
    generator=split_rng,
)

# Plain lists of ids rather than Subset objects, so they can be sliced
# (data_generator slices `train`) as well as indexed.
train = [image_ids[i] for i in train_ids.indices]
test = [image_ids[i] for i in test_ids.indices]

In [None]:
def data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield batches of tokenized captions.

    Args:
        train: sliceable sequence of image ids.
        mapping: dict from image id to its list of caption strings.
        features: unused; kept so existing call sites keep working.
        tokenizer: object exposing `batch_encode_plus`.
        max_length: length every caption is padded/truncated to.
        vocab_size: unused; kept so existing call sites keep working.
        batch_size: number of image ids per batch.

    Yields:
        The 'input_ids' tensor of shape (captions_in_batch, max_length). The
        batch dimension is always kept, even when the batch holds only one
        caption.

    The generator loops over `train` forever. It stops immediately if
    `train` is empty, instead of spinning without ever yielding.
    """
    if len(train) == 0:
        return
    while True:
        for start in range(0, len(train), batch_size):
            # Flatten the captions of all images in this slice into one list.
            batch_caps = [
                caption
                for image_id in train[start:start + batch_size]
                for caption in mapping[image_id]
            ]
            batch_seq = tokenizer.batch_encode_plus(batch_caps,
                                                     max_length=max_length,
                                                     padding='max_length',
                                                     truncation=True,
                                                     return_tensors='pt')
            # No squeeze(): it would silently drop the batch axis for a
            # single-caption batch.
            yield batch_seq['input_ids']

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
import time



# Number of dataset items (images) per DataLoader batch. CaptionDataset returns all captions
# of an image as one item, so each step processes batch_size * captions-per-image sequences.
batch_size = 4

# Define the custom dataset class
class CaptionDataset(Dataset):
    """Dataset yielding the tokenized captions of one image per item.

    Args:
        train: indexable sequence of image ids.
        mapping: dict from image id to its list of caption strings.
        features: image features; stored on the instance but not read by
            `__getitem__`.
        tokenizer: object exposing `batch_encode_plus`.
        max_length: length every caption is padded/truncated to.

    NOTE(review): items contain text only; the image features never reach the
    model through this dataset.
    """

    def __init__(self, train, mapping, features, tokenizer, max_length):
        self.train = train
        self.mapping = mapping
        self.features = features
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of image ids."""
        return len(self.train)

    def __getitem__(self, idx):
        """Return a tensor of token ids, one row of `max_length` ids per caption."""
        image_id = self.train[idx]
        # The feature lookup was dropped: its result was never used, and it
        # raised KeyError for ids that have captions but no feature entry.
        encoded = self.tokenizer.batch_encode_plus(
            self.mapping[image_id],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        return torch.tensor(encoded['input_ids'])

# Create the dataset and data loader.
# The dataset object gets its own name so the `dataset` string defined earlier
# (the dataset's name, "RSICD") is not overwritten by a Dataset instance.
train_dataset = CaptionDataset(train, mapping, features, tokenizer, max_length)
data_loader = DataLoader(train_dataset, batch_size=batch_size)

# Define the model
class CaptionModel(torch.nn.Module):
    """Thin wrapper around a pretrained T5-small conditional-generation model.

    The wrapped model is stored as `self.decoder`, although it is the full
    encoder-decoder T5.

    NOTE(review): in training mode the same caption token ids are passed both
    as encoder input and as labels, and no image features are consumed, so
    the model is trained to reproduce the caption it is given.
    """

    def __init__(self):
        super(CaptionModel, self).__init__()
        self.decoder = T5ForConditionalGeneration.from_pretrained('t5-small')

    def forward(self, cap):
        """Run a training forward pass or generate, depending on `self.training`.

        Args:
            cap: integer tensor of token ids whose last dimension is the
                sequence length. Leading dimensions are flattened into one
                batch dimension.

        Returns:
            Training mode: `(logits, targets)`, where `logits` has shape
            (N, seq_len, vocab) and `targets` is the flattened (N, seq_len)
            id tensor, aligned position by position.
            Eval mode: the token ids returned by `generate`.
        """
        flat = cap.reshape(-1, cap.size(-1))
        if self.training:
            outputs = self.decoder(input_ids=flat, labels=flat)
            # T5ForConditionalGeneration builds the decoder inputs by shifting
            # `labels` right itself, so logits[:, t] already scores labels[:, t].
            # Shifting once more here would pair each prediction with the
            # following token.
            return outputs.logits, flat
        # generate() expects a 2-D (batch, seq_len) tensor of ids.
        return self.decoder.generate(input_ids=flat, max_length=flat.size(-1))

# Initialize the model, optimizer, and loss function
model = CaptionModel()
model.to(device)   # train on the GPU when one is available
model.train()      # CaptionModel.forward branches on self.training
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# Targets are passed as class indices; padded positions add nothing to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)


start_time = time.time()

for epoch in range(3):
    print(f"Epoch {epoch+1}/3")
    for batch in data_loader:
        batch_start = time.time()
        optimizer.zero_grad()
        batch = batch.to(device)
        outputs, targets = model(batch)
        # CrossEntropyLoss takes (N, C) logits with (N,) class indices, so
        # batch and time are flattened together and the vocabulary is the
        # class axis. Passing (N, T, V) logits with one-hot (N, T, V) targets
        # treats the time axis as the class axis, and the one-hot tensor alone
        # costs hundreds of MB per batch.
        loss = loss_fn(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        # Time spent on this batch only (not cumulative since start_time).
        print(f"Batch processing time: {time.time() - batch_start:.2f} seconds")

Epoch 1/3
Batch processing time: 12.80 seconds
Batch processing time: 29.05 seconds
Batch processing time: 43.42 seconds
Batch processing time: 56.99 seconds
Batch processing time: 70.36 seconds
Batch processing time: 83.68 seconds
Batch processing time: 96.85 seconds
Batch processing time: 110.30 seconds
Batch processing time: 123.93 seconds
Batch processing time: 137.00 seconds
Batch processing time: 150.24 seconds
Batch processing time: 163.91 seconds
Batch processing time: 177.94 seconds
Batch processing time: 191.73 seconds
Batch processing time: 205.20 seconds
Batch processing time: 217.79 seconds
Batch processing time: 230.67 seconds
Batch processing time: 244.11 seconds
Batch processing time: 257.54 seconds
Batch processing time: 270.94 seconds
Batch processing time: 284.35 seconds
Batch processing time: 297.84 seconds
Batch processing time: 311.50 seconds
Batch processing time: 325.50 seconds
Batch processing time: 339.60 seconds
Batch processing time: 353.71 seconds
Batch pro

In [None]:
# Save the model weights. A torch.nn.Module has no `.save()` method (that is
# the Keras API), so the state dict is written with torch.save instead.
# Restore with `model.load_state_dict(torch.load(path))`.
# NOTE(review): the file keeps its original '.h5' name but is a PyTorch
# checkpoint, not an HDF5 file.
torch.save(model.state_dict(), WORKING_DIR+'/RSICD__RESNET_best_model.h5')