**LIBRARIES**

In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torchvision
from torchvision import transforms

from collections import Counter
from PIL import Image

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as fun

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download()
from nltk.tokenize import word_tokenize

import match
import pickle
import gc
import random

**DATA Preprocessing** 

In [49]:
data = pd.read_csv("captions.txt", sep=',')
display(data)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .
...,...,...
40450,997722733_0cb5439472.jpg,A man in a pink shirt climbs a rock face
40451,997722733_0cb5439472.jpg,A man is rock climbing high in the air .
40452,997722733_0cb5439472.jpg,A person in a red shirt climbing up a rock face covered in assist handles .
40453,997722733_0cb5439472.jpg,A rock climber in a red shirt .


**Tokenization**

In [50]:
#Removes Single Char
def remove_single_char(caption_list):
    list = []
    for word in caption_list:
        if len(word)>1:
            list.append(word)
    return list

In [51]:
#Make an array of words out of caption and then remove useless single char words

# data['caption'] = data['caption'].apply(lambda caption : ['<start>'] + [word.lower() if word.isalpha() else '' for word in caption] + ['<end>'])

data['caption'] = data['caption'].apply(lambda caption :word_tokenize(caption))

data['caption'] = data['caption'].apply(lambda word : remove_single_char(word))

#We need to make sure size of all the captions arrays is same so we add <cell> to cover up
lengths = []
lengths = data['caption'].apply(lambda caption : len(caption))

max_length = lengths.max()

data['caption'] = data['caption'].apply(lambda caption : caption + ['<cell>']*(max_length-len(caption)))

#For non truncated dataframe to appear
pd.set_option('display.max_colwidth', None)
display(data)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,"[child, in, pink, dress, is, climbing, up, set, of, stairs, in, an, entry, way, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]"
1,1000268201_693b08cb0e.jpg,"[girl, going, into, wooden, building, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]"
2,1000268201_693b08cb0e.jpg,"[little, girl, climbing, into, wooden, playhouse, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]"
3,1000268201_693b08cb0e.jpg,"[little, girl, climbing, the, stairs, to, her, playhouse, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]"
4,1000268201_693b08cb0e.jpg,"[little, girl, in, pink, dress, going, into, wooden, cabin, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]"
...,...,...
40450,997722733_0cb5439472.jpg,"[man, in, pink, shirt, climbs, rock, face, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]"
40451,997722733_0cb5439472.jpg,"[man, is, rock, climbing, high, in, the, air, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]"
40452,997722733_0cb5439472.jpg,"[person, in, red, shirt, climbing, up, rock, face, covered, in, assist, handles, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]"
40453,997722733_0cb5439472.jpg,"[rock, climber, in, red, shirt, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]"


**Vocab and Dictionary**

In [52]:
#Extracting words 
words = data['caption'].apply(lambda word : " ".join(word)).str.cat(sep = ' ').split(' ')
display(words)

['child',
 'in',
 'pink',
 'dress',
 'is',
 'climbing',
 'up',
 'set',
 'of',
 'stairs',
 'in',
 'an',
 'entry',
 'way',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 'girl',
 'going',
 'into',
 'wooden',
 'building',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 'little',
 'girl',
 'climbing',
 'into',
 'wooden',
 'playhouse',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 '<cell>',
 

In [53]:
#Arranging the words in order of their frequency
word_dict = sorted(Counter(words), key=Counter(words).get, reverse=True)

dict_size = len(word_dict)

print(len(word_dict))
display(word_dict)

9594


['<cell>',
 'in',
 'the',
 'on',
 'is',
 'and',
 'dog',
 'with',
 'man',
 'of',
 'Two',
 'white',
 'black',
 'are',
 'boy',
 'woman',
 'girl',
 'to',
 'The',
 'wearing',
 'at',
 'water',
 'red',
 'brown',
 'people',
 'young',
 'his',
 'blue',
 'dogs',
 'running',
 'through',
 'playing',
 'while',
 'an',
 'down',
 'shirt',
 'standing',
 'ball',
 'little',
 'grass',
 'snow',
 'child',
 'jumping',
 'over',
 'person',
 'front',
 'sitting',
 'holding',
 'field',
 'two',
 'up',
 'by',
 'green',
 'small',
 'yellow',
 'large',
 'her',
 'group',
 'walking',
 'Three',
 'into',
 'air',
 'beach',
 'men',
 'near',
 'one',
 'children',
 'mouth',
 'jumps',
 'another',
 'for',
 'street',
 'runs',
 'its',
 'from',
 'riding',
 'stands',
 'as',
 'bike',
 'girls',
 'outside',
 'other',
 'out',
 'rock',
 'next',
 'play',
 'off',
 'looking',
 'pink',
 'orange',
 'player',
 'their',
 'pool',
 'camera',
 'hat',
 'jacket',
 'around',
 'boys',
 'behind',
 'women',
 'background',
 'toy',
 'dirt',
 'sits',
 'socc

In [54]:
#Encoding the words with index in dictionary made above
data['sequence'] = data['caption'].apply(lambda caption : [word_dict.index(word) for word in caption])

display(data)

Unnamed: 0,image,caption,sequence
0,1000268201_693b08cb0e.jpg,"[child, in, pink, dress, is, climbing, up, set, of, stairs, in, an, entry, way, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]","[41, 1, 88, 172, 4, 118, 50, 407, 9, 403, 1, 33, 5473, 712, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,1000268201_693b08cb0e.jpg,"[girl, going, into, wooden, building, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]","[16, 318, 60, 195, 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,1000268201_693b08cb0e.jpg,"[little, girl, climbing, into, wooden, playhouse, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]","[38, 16, 118, 60, 195, 2488, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,1000268201_693b08cb0e.jpg,"[little, girl, climbing, the, stairs, to, her, playhouse, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]","[38, 16, 118, 2, 403, 17, 56, 2488, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,1000268201_693b08cb0e.jpg,"[little, girl, in, pink, dress, going, into, wooden, cabin, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]","[38, 16, 1, 88, 172, 318, 60, 195, 3089, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...
40450,997722733_0cb5439472.jpg,"[man, in, pink, shirt, climbs, rock, face, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]","[8, 1, 88, 35, 255, 83, 121, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
40451,997722733_0cb5439472.jpg,"[man, is, rock, climbing, high, in, the, air, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]","[8, 4, 83, 118, 196, 1, 2, 61, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
40452,997722733_0cb5439472.jpg,"[person, in, red, shirt, climbing, up, rock, face, covered, in, assist, handles, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]","[44, 1, 22, 35, 118, 50, 83, 121, 185, 1, 3699, 1761, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
40453,997722733_0cb5439472.jpg,"[rock, climber, in, red, shirt, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>, <cell>]","[83, 372, 1, 22, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [58]:
#Training and Validation Data
#Same images but different captions

data = data.sort_values(by = 'image')
train, validation = train_test_split(data,test_size=0.1,train_size=0.9)

print(len(train), train['image'].nunique())
print(len(validation), validation['image'].nunique())

36409 8091
4046 3298


In [62]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


**Extracting Features from Images using Pre-Trained ResNet**

In [69]:
def image_smoothing(image,sigmaX=10):
    image = np.array(image)
    image = cv2.resize(image, (512, 512))
    image = cv2.addWeighted(image,4, cv2.GaussianBlur( image , (0,0) , sigmaX) ,-4 ,128)
    image = PIL.Image.fromarray(image)
    return image

class ImageSmoothingTransform:
    def __call__(self, image):
        return image_smoothing(image)

In [70]:
transform = transforms.Compose([transforms.Resize([224,224]), ImageSmoothingTransform(), transforms.ToTensor(),transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]) ])

class extractImageFeatures():
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def images_extract(self, index):
        image_name = self.data.iloc[index]['image']
        image = Image.open('Images/' + str(image_name))

        tensor_image = self.transform(image)
        return image_name, tensor_image
        

In [73]:
train_dataset = extractImageFeatures(train[['image']].drop_duplicates())
train_dataloader = DataLoader(train_dataset, batch_size = 1, shuffle=False)

valid_dataset = extractImageFeatures(validation[['image']].drop_duplicates())
valid_dataloader = DataLoader(valid_dataset, batch_size = 1, shuffle=False)

In [74]:
resnet18 = torchvision.models.resnet18(weights=ResNet18_Weights.DEFAULT).to(device)
resnet18.eval()
list(resnet18._modules)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\asus/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth
100.0%


['conv1',
 'bn1',
 'relu',
 'maxpool',
 'layer1',
 'layer2',
 'layer3',
 'layer4',
 'avgpool',
 'fc']

In [None]:
resNet18Layer4 = resnet18._modules.get('layer4').to(device)

In [None]:
https://www.kaggle.com/code/hirotaka0122/feature-extraction-by-resnet
https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-build-a-resnet-from-scratch-with-tensorflow-2-and-keras.md
https://www.kaggle.com/code/dipanjandas96/image-caption-resnet-transformerdecoder-pytorch/notebook