In [9]:
import torch
import numpy as np

import torch.nn as nn
from torchvision import transforms
from torch.nn.utils.rnn import pack_padded_sequence
from PIL import Image


from datasets import Flickr8k_Images, Flickr8k_Features
from models import DecoderRNN, EncoderCNN
from utils import *
from config import *

# Mode switch, as described by the original author: False -> train the model,
# True -> try to load the model from a checkpoint and evaluate it.
# NOTE(review): none of the cells shown here read EVAL — confirm where it is used.
EVAL = False


# Rebuild the caption data and vocabulary from the training token file, using
# the same three helper calls as the feature-extraction script
# (extract_features.py), so both stages work from identical inputs.
# read_lines / parse_lines / build_vocab and TOKEN_FILE_TRAIN come from the
# star imports above (utils / config).
lines = read_lines(TOKEN_FILE_TRAIN)
# presumably one image id per cleaned caption (the two lists have equal length
# when printed further down) — verify against parse_lines
image_ids, cleaned_captions = parse_lines(lines)
vocab = build_vocab(cleaned_captions)

In [10]:
# Pick the compute device: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


# Build the caption decoder from the config constants and the vocabulary size,
# then place it on the selected device.
decoder = DecoderRNN(EMBED_SIZE, HIDDEN_SIZE, len(vocab), NUM_LAYERS)
decoder = decoder.to(device)

In [11]:
# Read the pre-computed image features back from disk, mapping the stored
# tensor straight onto the selected device, and report their shape.
features = torch.load(f='features.pt', map_location=device)
print("Loaded features", features.size())

Loaded features torch.Size([7089, 2048])


In [12]:
# features.pt holds one row per image, while the caption list holds five
# entries per image (the extraction script keeps every fifth image id).
# Repeat each feature row five times, in place order, so that row i of
# `features` lines up with caption i.
#
# The expansion is guarded: this cell used to multiply the rows by 5 on every
# execution, so re-running it silently misaligned features and captions.
# With the guard, running the cell again is a no-op.
if features.shape[0] != len(cleaned_captions):
    features = features.repeat_interleave(5, dim=0)

# Fail loudly if the expanded features still do not match the captions
# (for example because an image does not have exactly five captions).
if features.shape[0] != len(cleaned_captions):
    raise ValueError(
        f"features rows ({features.shape[0]}) do not match the number of "
        f"captions ({len(cleaned_captions)})"
    )
print("Duplicated features", features.shape)

Duplicated features torch.Size([35445, 2048])


In [13]:
# Training dataset over pre-computed image features (project class from
# datasets.py). It is given the image ids, the cleaned captions, the
# vocabulary and the feature tensor loaded from features.pt.
# NOTE(review): presumably item i pairs features[i] with captions[i], which
# requires the three sequences to have equal length — confirm in
# Flickr8k_Features.
dataset_train = Flickr8k_Features(
    image_ids=image_ids,
    captions=cleaned_captions,
    vocab=vocab,
    features=features,
)

In [17]:
# Mini-batch iterator over the training dataset. Samples are reshuffled every
# epoch and batched with the project's caption_collate_fn rather than the
# default collation.
train_loader = torch.utils.data.DataLoader(
    dataset=dataset_train,
    collate_fn=caption_collate_fn,  # explicit override of the default collate_fn
    shuffle=True,
    batch_size=64,  # tune as needed
    num_workers=0,  # load in the main process; raise if worker processes are usable
)

In [25]:
# Training objective and optimiser: cross-entropy loss, and Adam over the
# decoder's parameters with the learning rate taken from config.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=decoder.parameters(), lr=LR)

# Sanity check: ids, captions and feature rows should all have the same count.
print(len(image_ids), len(cleaned_captions), features.shape, sep="\n")

35445
35445
torch.Size([35445, 2048])


In [1]:
"""
COMP5623M Coursework on Image Caption Generation


Forward pass through Flickr8k image data to extract and save features from
pretrained CNN.

"""


import torch
import numpy as np

import torch.nn as nn
from torchvision import transforms

from models import EncoderCNN
from datasets import Flickr8k_Images
from utils import *
from config import *
import string



# Read the raw lines of the training token file.
lines = read_lines(TOKEN_FILE_TRAIN)
# Uncomment to inspect the raw lines:
# print(lines[:2])

#########################################################################
#
#       QUESTION 1.1 Text preparation
#
#########################################################################

# Split the raw lines into image ids and cleaned caption text.
image_ids, cleaned_captions = parse_lines(lines)
# Uncomment to check the output of the cleaning function:
# print(image_ids[:2])
# print(cleaned_captions[:2])

# Build the vocabulary from the cleaned captions.
vocab = build_vocab(cleaned_captions)
# Uncomment to check the vocabulary size:
# print("Number of words in vocab:", vocab.idx)

# Keep every fifth id so each image is passed through the encoder only once.
# NOTE(review): assumes every image contributes exactly five consecutive
# caption lines — confirm against the token file. The training cells rely on
# the same factor when they expand the features with repeat_interleave(5, 0).
image_ids = image_ids[::5]


# Image preprocessing for the encoder: resize, take a 224x224 centre crop (the
# input size expected by the pre-trained ResNet), convert to a tensor, and
# normalise each channel with the ImageNet statistics.
data_transform = transforms.Compose([
    transforms.Resize(size=224),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

# Image dataset over the de-duplicated ids, applying the preprocessing pipeline.
dataset_train = Flickr8k_Images(transform=data_transform, image_ids=image_ids)

# Sequential (unshuffled) loader, so the order of the extracted features
# follows the order of image_ids.
train_loader = torch.utils.data.DataLoader(
    dataset=dataset_train,
    shuffle=False,
    batch_size=64,
    num_workers=2,
)

# device configuration: use the GPU when available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Encoder used purely as a fixed feature extractor.
model = EncoderCNN().to(device)
# Switch to inference mode before extracting features. torch.no_grad() only
# disables gradient tracking; it does not stop BatchNorm layers from using
# per-batch statistics (and updating their running averages) or Dropout from
# being active. Without eval() the saved features could depend on how the
# images happen to be batched.
# NOTE(review): EncoderCNN's internals are not visible here; calling eval() is
# harmless if the class already does so.
model.eval()



#########################################################################
#
#        QUESTION 1.2 Extracting image features
# 
#########################################################################
# Run every image through the encoder once and collect the outputs batch by
# batch. No gradients are needed for feature extraction.
feature_batches = []
with torch.no_grad():
    for inputs in train_loader:
        outputs = model(inputs.to(device))
        # Collapse everything after the batch dimension explicitly. A bare
        # squeeze() would also drop the batch dimension of a final batch that
        # holds a single image, which breaks the concatenation below.
        feature_batches.append(outputs.flatten(start_dim=1))

# Stack the per-batch results into a single tensor with one row per image, in
# loader order (the loader is built with shuffle=False).
features = torch.cat(feature_batches, dim=0)

# Sanity check: the printed shape should be [len(dataset_train), 2048].
print(features.shape)


# save features
torch.save(features, "features.pt")




torch.Size([7089, 2048])
