In [18]:
import os 
import torch
import torchvision
import torchvision.datasets as dset
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np 
import sklearn.metrics
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import pandas as pd 

# Training split of COCO captions: images are read from the 'train2014'
# directory and paired with the captions listed in 'captions_train2014.json'.
cap = dset.CocoCaptions(
    root='train2014',
    annFile='captions_train2014.json',
    transform=transforms.Compose([
        # Resize, then take the central 224x224 crop.
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        # Per-channel mean / std; these are the ImageNet statistics that
        # torchvision documents for its pretrained classification models.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
)


loading annotations into memory...
Done (t=0.63s)
creating index...
index created!


In [3]:
# Validation split of COCO captions: images are read from the 'val2014'
# directory and paired with the captions listed in 'captions_val2014.json'.
cap_val = dset.CocoCaptions(
    root='val2014',
    annFile='captions_val2014.json',
    transform=transforms.Compose([
        # Resize, then take the central 224x224 crop.
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        # Per-channel mean / std; these are the ImageNet statistics that
        # torchvision documents for its pretrained classification models.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
)

loading annotations into memory...
Done (t=0.36s)
creating index...
index created!


In [17]:
# Sanity check: report the dataset size and look at one (image, captions) pair.
print('Number of samples: ', len(cap))
img, target = cap[0] # load the first sample (index 0): transformed image tensor + its list of caption strings

print("Image Size: ", img.size())
print(target)


Number of samples:  82783
Image Size:  torch.Size([3, 224, 224])
['Closeup of bins of food that include broccoli and bread.', 'A meal is presented in brightly colored plastic trays.', 'there are containers filled with different kinds of foods', 'Colorful dishes holding meat, vegetables, fruit, and bread.', 'A bunch of trays that have different food.']


In [2]:
# DenseNet-201 with pretrained weights, fetched through torch.hub from the
# 'pytorch/vision' repository pinned at tag v0.10.0.
# NOTE(review): `model` is not referenced by any other cell visible here
# (CNNencoder builds its own densenet201) - confirm it is still needed.
model = torch.hub.load(
    'pytorch/vision:v0.10.0',
    'densenet201',
    pretrained=True,
)

# Mini-batches of 32 samples drawn from the training dataset `cap`.
# NOTE(review): `shuffle` is left at the DataLoader default (False), so the
# samples are visited in dataset order - confirm this is intended for training.
trainloader = torch.utils.data.DataLoader(cap, batch_size=32)


Using cache found in C:\Users\ASUSROG/.cache\torch\hub\pytorch_vision_v0.10.0


In [7]:
 # Mini-batches of 32 samples drawn from the validation dataset `cap_val`.
 valloader = torch.utils.data.DataLoader(cap_val, batch_size=32)

In [None]:
class CNNencoder(nn.Module):
    """Image encoder built from a pretrained torchvision DenseNet-201.

    Every child module of the network except the last one is kept and chained
    in an ``nn.Sequential``; in torchvision's DenseNet the last child is the
    classification layer, so the encoder stops before classification.
    """

    def __init__(self):
        super().__init__()
        backbone = models.densenet201(pretrained=True)
        # Keep all top-level children but the final one.
        kept_layers = list(backbone.children())[:-1]
        self.densenet = nn.Sequential(*kept_layers)

    def forward(self, images):
        """Run ``images`` through the truncated DenseNet and return its output.

        NOTE(review): no pooling or flattening is applied here, so the result
        is presumably a spatial feature map rather than a flat vector - confirm
        the shape against whatever consumes it.
        """
        return self.densenet(images)

In [None]:
class lstmEncoder(nn.Module):
    """LSTM language model that turns image features plus caption tokens into
    per-step vocabulary scores.

    The image feature vector is placed in front of the embedded caption tokens
    as the first time step, and the LSTM output at every step is projected
    onto the vocabulary.

    Args:
        embed_size: Size of the word embeddings (and of the image feature
            vector fed to ``forward``).
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        vocab_size: Number of distinct token ids.

    Any argument left as ``None`` is read from a notebook-level variable of
    the same name, so a bare ``lstmEncoder()`` keeps working when those
    globals are defined.

    Raises:
        NameError: If a size is neither passed in nor defined as a global.
    """

    def __init__(self, embed_size=None, hidden_size=None, num_layers=None,
                 vocab_size=None):
        super(lstmEncoder, self).__init__()
        embed_size = self._hyperparam('embed_size', embed_size)
        hidden_size = self._hyperparam('hidden_size', hidden_size)
        num_layers = self._hyperparam('num_layers', num_layers)
        vocab_size = self._hyperparam('vocab_size', vocab_size)

        # Token-id -> vector lookup used by forward(); without this layer
        # forward() fails with AttributeError on `self.embed`.
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)

    @staticmethod
    def _hyperparam(name, value):
        """Return ``value``, or the module-level variable ``name`` if it is None."""
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError("name '%s' is not defined" % name) from None

    def forward(self, features, captions):
        """Score every vocabulary entry at every time step.

        Args:
            features: Float tensor of shape ``(batch, embed_size)``.
            captions: Integer tensor of token ids, shape ``(seq_len, batch)``
                (time-major, matching the LSTM's default layout).

        Returns:
            Tensor of shape ``(seq_len + 1, batch, vocab_size)``; the extra
            step comes from the image features placed at position 0.
        """
        embeddings = self.dropout(self.embed(captions))
        # Prepend the image features as the first element of the sequence.
        embeddings = torch.cat((features.unsqueeze(0), embeddings), dim=0)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs
    
        

In [1]:
# Dependencies for building the caption vocabulary.
import nltk
from collections import Counter
# Fetch NLTK's 'punkt' data package (presumably needed by the
# nltk.tokenize.word_tokenize call in build_vocab - confirm).
nltk.download('punkt')
# NOTE(review): the next three lines repeat the three statements above
# verbatim; the second download only reports the package as up-to-date.
import nltk
from collections import Counter
nltk.download('punkt')
from pycocotools.coco import COCO
class Vocabulary(object):
    """Two-way mapping between words and consecutive integer ids."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        # Id that will be handed to the next previously unseen word.
        self.idx = 0

    def add_word(self, word):
        """Register ``word`` under the next free id; known words are ignored."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __len__(self):
        """Number of distinct words stored."""
        return len(self.word2idx)

    def encode(self, word):
        """Return the id of ``word``, or the id of ``'[unk]'`` if it is unknown.

        Raises KeyError when the word is unknown and ``'[unk]'`` itself has
        not been added.
        """
        key = word if word in self.word2idx else '[unk]'
        return self.word2idx[key]

    def decode(self, idx):
        """Return the word stored under ``idx`` (KeyError if there is none)."""
        return self.idx2word[idx]


def build_vocab(json='captions_train2014.json',
                threshold=4,
                max_words=15000):
    """Build a Vocabulary from the captions in a COCO annotation file.

    Every caption is lower-cased and tokenized with NLTK, and token
    frequencies are counted. Of the ``max_words - 5`` most frequent tokens
    (five ids go to the special tokens added below), those seen at least
    ``threshold`` times are kept.

    Args:
        json: Path of the COCO captions annotation file.
        threshold: Minimum number of occurrences for a token to be kept.
        max_words: Upper bound on the vocabulary size, special tokens included.

    Returns:
        ``(vocab, words)``: the populated Vocabulary and the list of kept
        tokens, most frequent first.
    """
    coco = COCO(json)
    ann_ids = coco.anns.keys()
    total = len(ann_ids)

    token_counts = Counter()
    for position, ann_id in enumerate(ann_ids):
        text = str(coco.anns[ann_id]['caption']).lower()
        token_counts.update(nltk.tokenize.word_tokenize(text))

        # Progress line once every 100000 captions.
        if position % 100000 == 0:
            print('[%d/%d] tokenized the captions.' % (position, total))

    words = [
        token
        for token, count in token_counts.most_common(max_words - 5)
        if count >= threshold
    ]

    vocab = Vocabulary()
    # Special tokens are added first, so they receive ids 0-4.
    for special in ('[pad]', '[start]', '[end]', '[cls]', '[unk]'):
        vocab.add_word(special)
    for token in words:
        vocab.add_word(token)

    print('total number of words in vocab:', len(vocab))
    return vocab, words


# Build the vocabulary from the default training annotation file.
# NOTE: despite its name, `nltk_tokenizer` holds the Vocabulary instance
# returned by build_vocab (word <-> id mapping), not an NLTK tokenizer;
# `words` is the list of kept tokens.
nltk_tokenizer, words = build_vocab()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUSROG\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUSROG\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


loading annotations into memory...
Done (t=0.74s)
creating index...
index created!
[0/414113] tokenized the captions.
[100000/414113] tokenized the captions.
[200000/414113] tokenized the captions.
[300000/414113] tokenized the captions.
[400000/414113] tokenized the captions.
total number of words in vocab: 9949
