In [106]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [116]:
import os 
import json

from PIL import Image

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import *
from data import ImageCaptionDataset, generate_batch
from torch.utils.data import DataLoader
import torch


In [117]:
# Root data folder; the caption JSON files are listed from here.
data_path = 'data'
# Image folder, derived from data_path so that relocating the dataset needs a
# single edit. It contains the 'train' and 'val' subfolders used further down.
imgs_path = os.path.join(data_path, 'img')

In [118]:
# Map each split name to its caption file, e.g. 'train' -> 'captions_train2014.json'.
# - sorted(): os.listdir returns entries in arbitrary order, so sort to make the
#   resulting dict order reproducible across machines.
# - Only names of the exact form captions_<split>2014.json are accepted, and the
#   split name is cut out by position instead of str.replace, which would also
#   strip matching text from the middle of a file name.
CAPTION_PREFIX = 'captions_'
CAPTION_SUFFIX = '2014.json'
captions_json = {
    fname[len(CAPTION_PREFIX):-len(CAPTION_SUFFIX)]: fname
    for fname in sorted(os.listdir(data_path))
    if fname.startswith(CAPTION_PREFIX) and fname.endswith(CAPTION_SUFFIX)
}
captions_json

{'val': 'captions_val2014.json', 'train': 'captions_train2014.json'}

In [119]:
# Load the caption data for both splits from the files mapped in captions_json.
# NOTE(review): load_images is not defined in this notebook (presumably it comes
# from the `utils` star import); the (train, val) return order is assumed from
# this unpacking — confirm against its definition.
df_train, df_val = load_images(captions_json, imgs_path)

In [120]:
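# Optional visual check, disabled by default — uncomment to run.
# (plot_images presumably previews samples from each split; confirm in utils.)
#plot_images(df_train, os.path.join(imgs_path,'train'))
#plot_images(df_val, os.path.join(imgs_path,'val'))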

In [121]:
# Build the image preprocessing pipeline with get_transform's default arguments.
# The bare `transform` on the last line displays its repr so the individual
# steps (resize, crop, flip, tensor conversion, normalisation) can be inspected.
transform = get_transform()
transform

Compose(
    Resize(size=256, interpolation=bilinear, max_size=None, antialias=warn)
    RandomCrop(size=(224, 224), padding=None)
    RandomHorizontalFlip(p=0.5)
    ToTensor()
    Normalize(mean=[0.45, 0.45, 0.45], std=[0.225, 0.225, 0.225])
)

In [122]:
# Wrap the training frame, its image folder and the preprocessing transform
# in an ImageCaptionDataset.
dataset = ImageCaptionDataset(
    df=df_train,
    img_path=os.path.join(imgs_path, 'train'),
    transform=transform,
)

In [123]:
# Look up the id assigned to the '<pad>' token; the same lookup supplies the
# padding_value when caption tensors are padded to a common length.
dataset.char_to_id('<pad>')

1

In [124]:
# Collect the caption tensors of the first six samples and print their shapes,
# showing that caption lengths differ from sample to sample.
ls = []
for idx, sample in enumerate(dataset):
    _, labeltmp = sample
    print(labeltmp.shape)
    ls.append(labeltmp)
    if idx >= 5:
        break

torch.Size([14])
torch.Size([12])
torch.Size([13])
torch.Size([14])
torch.Size([13])
torch.Size([11])


In [125]:
from torch.nn.utils.rnn import pad_sequence

# Pad the collected caption tensors with the '<pad>' id up to the longest one.
# batch_first=True returns the (num_captions, max_len) layout directly, which
# replaces transposing the default (max_len, num_captions) result with `.T`.
pad_sequence(ls, batch_first=True, padding_value=dataset.char_to_id('<pad>'))

tensor([[ 2,  4,  5,  6,  7,  8,  9, 10, 11,  4, 12, 13, 14,  3],
        [ 2, 15, 16, 17,  4, 18, 19, 20, 21, 22, 14,  3,  1,  1],
        [ 2,  4, 23, 24, 25, 26, 27, 28, 29,  9, 30, 14,  3,  1],
        [ 2,  4, 31, 32,  4, 33,  8, 34, 35, 36, 17,  9, 37,  3],
        [ 2,  4, 38,  8, 39, 17,  4, 40, 35, 41, 42, 14,  3,  1],
        [ 2,  4, 43, 24, 44, 45,  4, 46, 47, 14,  3,  1,  1,  1]])

In [126]:
# Number of samples per batch for the train_loader created in the next cell.
BATCH_SIZE = 4

In [127]:
# Shuffled loader over the training dataset; generate_batch is the collate
# function that assembles each list of samples into a batch.
train_loader = DataLoader(
    dataset,
    collate_fn=generate_batch,
    shuffle=True,
    batch_size=BATCH_SIZE,
)

In [128]:
# Fetch a single batch and report the image and caption tensor shapes.
for batch in train_loader:
    imgtmp, labeltmp = batch
    print(imgtmp.shape, labeltmp.shape, sep='\n')
    break


torch.Size([4, 3, 224, 224])
torch.Size([4, 15])


# Model

In [11]:
class EncoderCNN(nn.Module):
    """Image encoder: a frozen pretrained ResNet-50 plus a trainable projection.

    The backbone (every ResNet child except the final fully connected layer) is
    used as a fixed feature extractor. Its output is flattened per sample and
    mapped to ``embed_size`` dimensions by a linear layer followed by
    BatchNorm1d; those two layers are the only trainable parts.

    Args:
        embed_size: Size of the image feature vector returned by ``forward``.

    NOTE(review): ``resnet50`` is not imported in this notebook's visible cells;
    it is assumed to be ``torchvision.models.resnet50`` — confirm.
    """

    def __init__(self, embed_size):
        super().__init__()
        # `pretrained=True` is deprecated since torchvision 0.13. Request the
        # weights it used to select (IMAGENET1K_V1) through `weights=` when
        # that API exists, and fall back to the legacy flag otherwise.
        try:
            from torchvision.models import ResNet50_Weights
        except ImportError:
            resnet = resnet50(pretrained=True)
        else:
            resnet = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)

        # Freeze the backbone: none of its parameters receive gradients.
        for param in resnet.parameters():
            param.requires_grad_(False)

        # Drop the last child (the classification layer) and keep the rest.
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.embed = nn.Linear(resnet.fc.in_features, embed_size)
        self.batch = nn.BatchNorm1d(embed_size, momentum=0.01)
        self.embed.weight.data.normal_(0., 0.02)
        self.embed.bias.data.fill_(0)

    def train(self, mode=True):
        """Switch train/eval mode while keeping the frozen backbone in eval mode.

        ``requires_grad_(False)`` stops weight updates, but BatchNorm layers in
        training mode still normalise with batch statistics and overwrite their
        running mean/variance. Forcing the backbone to eval mode keeps the
        pretrained statistics intact, so it really is a fixed extractor.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            This module, as ``nn.Module.train`` does.
        """
        super().train(mode)
        self.resnet.eval()
        return self

    def forward(self, images):
        """Encode a batch of images.

        Args:
            images: Batch of image tensors in the layout the backbone expects.

        Returns:
            Tensor of shape ``(batch, embed_size)``.
        """
        features = self.resnet(images)
        # Collapse everything after the batch dimension into one feature axis.
        features = features.view(features.size(0), -1)
        features = self.batch(self.embed(features))
        return features

![](https://miro.medium.com/v2/resize:fit:1100/format:webp/0*4cE8ZvhN7c_xQRgi.png)

In [12]:
class DecoderRNN(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    The image feature is fed to the LSTM as the first time step, followed by
    the embedded caption tokens with the last one dropped. Every LSTM output is
    then projected to vocabulary-sized scores.

    Args:
        embed_size: Size of the token embeddings (and of the image feature).
        hidden_size: Size of the LSTM hidden state.
        vocab_size: Number of tokens in the vocabulary.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super().__init__()
        # Hyper-parameters stay available as attributes.
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.drop_prob = 0.2
        self.vocabulary_size = vocab_size

        # Sub-modules, registered in the order lstm, dropout, embed, linear.
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # Note: this dropout layer is created but forward() never applies it.
        self.dropout = nn.Dropout(p=self.drop_prob)
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.linear = nn.Linear(hidden_size, vocab_size)

        # Small uniform initial weights; zero bias on the output projection.
        nn.init.uniform_(self.embed.weight, -0.1, 0.1)
        nn.init.uniform_(self.linear.weight, -0.1, 0.1)
        nn.init.zeros_(self.linear.bias)

    def forward(self, features, captions):
        """Score every vocabulary token at every step of the caption.

        Args:
            features: Image features, one ``embed_size`` vector per sample.
            captions: Token ids, one row per sample.

        Returns:
            Scores with a last dimension of ``vocab_size`` and as many time
            steps as ``captions`` has tokens per row.
        """
        # Drop the final token: with the image step prepended, the input
        # sequence then has the same length as `captions`.
        token_vecs = self.embed(captions)[:, :-1, :]
        image_step = features.unsqueeze(1)
        lstm_inputs = torch.cat([image_step, token_vecs], dim=1)
        hiddens, _ = self.lstm(lstm_inputs)
        return self.linear(hiddens)

In [13]:
import torch
import torch.nn as nn
from torchvision import transforms
import sys
import math

In [18]:
# Scalar settings for the training run.
vocab_threshold = 6  # minimum word count threshold
batch_size = 10      # batch size

# Per-channel normalisation statistics handed to get_transform.
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
transform_train = get_transform(std=std, mean=mean)

['a', 'person', 'doing', 'a', 'trick', 'on', 'a', 'rail', 'while', 'riding', 'a', 'skateboard', '.']


In [18]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [19]:
# Image-size constants.
SIDE_SIZE = 224
CROP_SIZE = 224
# Normalisation constants: one identical value repeated for three entries.
MEAN = [0.45] * 3
STD = [0.225] * 3