**LIBRARIES**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torchvision
from torchvision import transforms
import cv2

from collections import Counter
from PIL import Image
import PIL

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as fun

import tensorflow
from tensorflow.keras import layers, models

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from tensorflow.keras import Model
from tensorflow.keras.layers import Add, GlobalAveragePooling2D,\
	Dense, Flatten, Conv2D, Lambda,	Input, BatchNormalization, Activation
from tensorflow.keras.optimizers import schedules, SGD
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

# import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download()
from nltk.tokenize import word_tokenize

import match
import pickle
import gc
import random

**DATA Preprocessing** 

In [2]:
data = pd.read_csv("captions.txt", sep=',')
display(data)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...
...,...,...
40450,997722733_0cb5439472.jpg,A man in a pink shirt climbs a rock face
40451,997722733_0cb5439472.jpg,A man is rock climbing high in the air .
40452,997722733_0cb5439472.jpg,A person in a red shirt climbing up a rock fac...
40453,997722733_0cb5439472.jpg,A rock climber in a red shirt .


**Tokenization**

In [3]:
#Removes Single Char
def remove_single_char(caption_list):
    list = []
    for word in caption_list:
        if len(word)>1:
            list.append(word)
    return list

In [4]:
#Make an array of words out of caption and then remove useless single char words

data['caption'] = data['caption'].apply(lambda caption :word_tokenize(caption))

data['caption'] = data['caption'].apply(lambda word : remove_single_char(word))

#We need to make sure size of all the captions arrays is same so we add <cell> to cover up
lengths = []
lengths = data['caption'].apply(lambda caption : len(caption))

max_length = lengths.max()

data['caption'] = data['caption'].apply(lambda caption : ['<start>'] + caption + ['<cell>']*(max_length-len(caption)) + ['<end>'])

#For non truncated dataframe to appear
pd.set_option('display.max_colwidth', None)
display(data)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,"[<start>, child, in, pink, dress, is, climbing, up, set, of, stairs, in, an, entry, way, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]"
1,1000268201_693b08cb0e.jpg,"[<start>, girl, going, into, wooden, building, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]"
2,1000268201_693b08cb0e.jpg,"[<start>, little, girl, climbing, into, wooden, playhouse, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]"
3,1000268201_693b08cb0e.jpg,"[<start>, little, girl, climbing, the, stairs, to, her, playhouse, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]"
4,1000268201_693b08cb0e.jpg,"[<start>, little, girl, in, pink, dress, going, into, wooden, cabin, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]"
...,...,...
40450,997722733_0cb5439472.jpg,"[<start>, man, in, pink, shirt, climbs, rock, face, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]"
40451,997722733_0cb5439472.jpg,"[<start>, man, is, rock, climbing, high, in, the, air, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]"
40452,997722733_0cb5439472.jpg,"[<start>, person, in, red, shirt, climbing, up, rock, face, covered, in, assist, handles, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]"
40453,997722733_0cb5439472.jpg,"[<start>, rock, climber, in, red, shirt, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]"


**Vocab and Dictionary**

In [5]:
#Extracting words 
words = data['caption'].apply(lambda word : " ".join(word)).str.cat(sep = ' ').split(' ')
display(words)

['<start>',
 'child',
 'in',
 'pink',
 'dress',
 'is',
 'climbing',
 'up',
 'set',
 'of',
 'stairs',
 'in',
 'an',
 'entry',
 'way',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<end>',
 '<start>',
 'girl',
 'going',
 'into',
 'wooden',
 'building',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<end>',
 '<start>',
 'little',
 'girl',
 'climbing',
 'into',
 'wooden',
 'playhouse',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',


In [6]:
#Arranging the words in order of their frequency
word_dict = sorted(Counter(words), key=Counter(words).get, reverse=True)

dict_size = len(word_dict)

print(len(word_dict))
display(word_dict)

9596


['<cell>',
 '<start>',
 '<end>',
 'in',
 'the',
 'on',
 'is',
 'and',
 'dog',
 'with',
 'man',
 'of',
 'Two',
 'white',
 'black',
 'are',
 'boy',
 'woman',
 'girl',
 'to',
 'The',
 'wearing',
 'at',
 'water',
 'red',
 'brown',
 'people',
 'young',
 'his',
 'blue',
 'dogs',
 'running',
 'through',
 'playing',
 'while',
 'an',
 'down',
 'shirt',
 'standing',
 'ball',
 'little',
 'grass',
 'snow',
 'child',
 'jumping',
 'over',
 'person',
 'front',
 'sitting',
 'holding',
 'field',
 'two',
 'up',
 'by',
 'green',
 'small',
 'yellow',
 'large',
 'her',
 'group',
 'walking',
 'Three',
 'into',
 'air',
 'beach',
 'men',
 'near',
 'one',
 'children',
 'mouth',
 'jumps',
 'another',
 'for',
 'street',
 'runs',
 'its',
 'from',
 'riding',
 'stands',
 'as',
 'bike',
 'girls',
 'outside',
 'other',
 'out',
 'rock',
 'next',
 'play',
 'off',
 'looking',
 'pink',
 'orange',
 'player',
 'their',
 'pool',
 'camera',
 'hat',
 'jacket',
 'around',
 'boys',
 'behind',
 'women',
 'background',
 'toy',
 '

In [7]:
#Encoding the words with index in dictionary made above
data['sequence'] = data['caption'].apply(lambda caption : [word_dict.index(word) for word in caption])

display(data)

Unnamed: 0,image,caption,sequence
0,1000268201_693b08cb0e.jpg,"[<start>, child, in, pink, dress, is, climbing, up, set, of, stairs, in, an, entry, way, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]","[1, 43, 3, 90, 174, 6, 120, 52, 409, 11, 405, 3, 35, 5475, 714, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]"
1,1000268201_693b08cb0e.jpg,"[<start>, girl, going, into, wooden, building, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]","[1, 18, 320, 62, 197, 118, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]"
2,1000268201_693b08cb0e.jpg,"[<start>, little, girl, climbing, into, wooden, playhouse, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]","[1, 40, 18, 120, 62, 197, 2490, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]"
3,1000268201_693b08cb0e.jpg,"[<start>, little, girl, climbing, the, stairs, to, her, playhouse, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]","[1, 40, 18, 120, 4, 405, 19, 58, 2490, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]"
4,1000268201_693b08cb0e.jpg,"[<start>, little, girl, in, pink, dress, going, into, wooden, cabin, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]","[1, 40, 18, 3, 90, 174, 320, 62, 197, 3091, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]"
...,...,...,...
40450,997722733_0cb5439472.jpg,"[<start>, man, in, pink, shirt, climbs, rock, face, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]","[1, 10, 3, 90, 37, 257, 85, 123, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]"
40451,997722733_0cb5439472.jpg,"[<start>, man, is, rock, climbing, high, in, the, air, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]","[1, 10, 6, 85, 120, 198, 3, 4, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]"
40452,997722733_0cb5439472.jpg,"[<start>, person, in, red, shirt, climbing, up, rock, face, covered, in, assist, handles, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]","[1, 46, 3, 24, 37, 120, 52, 85, 123, 187, 3, 3701, 1763, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]"
40453,997722733_0cb5439472.jpg,"[<start>, rock, climber, in, red, shirt, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <end>]","[1, 85, 374, 3, 24, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]"


In [8]:
#Training and Validation Data
#Same images but different captions

data = data.sort_values(by = 'image')
train, validation = train_test_split(data,test_size=0.1,train_size=0.9)

print(len(train), train['image'].nunique())
print(len(validation), validation['image'].nunique())

36409 8091
4046 3336


**ResNet18 Pretrained Model**

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [10]:
# resnet50 = torchvision.models.resnet50(pretrained=True).to(device)
# resnet50.eval()
# list(resnet50._modules)
# # for parameters in resnet50.parameters():
# #     parameters.requires_grad_(False)
# resnet50Layer4 = resnet50._modules.get('layer4').to(device)

**Extracting Features from Images**

In [26]:
preprocess = transforms.Compose([transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])])

def get_image_tensor(index, preprocess, data):
    image_name = data.iloc[index]['image']
    img_loc = 'Images/'+str(image_name)
    img = Image.open(img_loc).convert('RGB')
    tensor_image = preprocess(img).unsqueeze(0)

    return tensor_image

**Encoder CNN**

In [33]:
class Encoder_CNN(nn.Module):
    def __init__(self, embed_size):
        super(Encoder_CNN, self).__init__()
        resnet50 = torchvision.models.resnet50(pretrained=True)
        # resnet50.eval()
        # for parameters in resnet50.parameters():
        #     parameters.requires_grad_(False)
        modules = list(resnet50.children())[:-1]
        self.model = nn.Sequential(*modules)
        self.embed = nn.Linear(resnet50.fc.in_features, embed_size)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, image):
        features = self.model(image)
        features = features.view(features.size(0), -1)
        features = self.dropout(self.relu(self.embed(features)))
        return features

In [34]:
encoder = Encoder_CNN(256)
encoder.eval()

for i, row in data.iterrows():
    image_tensor = get_image_tensor(i, preprocess, data)
    image_tensor = image_tensor.to(device)

    print(f"Initial tensor shape: {image_tensor.shape}")

    with torch.no_grad():
        for name, module in encoder.model.named_children():
            image = module(image_tensor)
            print(f"After layer {name}: {image_tensor.shape}")

    
        output = image.view(image.size(0), -1)
        print(f"Output tensor shape: {output.shape}")
        
        embed_vector = encoder.embed(output)
        data[i]['embedded'] = embed_vector.cpu().numpy()

Initial tensor shape: torch.Size([1, 3, 224, 224])
After layer 0: torch.Size([1, 3, 224, 224])


RuntimeError: running_mean should contain 3 elements not 64