In [1]:
#Import all the required libraries
import os, glob, random
import pickle, time, json
import warnings
from collections import Counter

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

from nltk.tokenize import wordpunct_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')
# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus reader
# to the collection of English stop words it returns; later cells use the name
# for membership tests, and the reader itself is no longer reachable by name.
stopwords = stopwords.words('english')

2022-10-05 21:08:49.016002: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-05 21:08:49.202937: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-10-05 21:08:49.215674: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-05 21:08:49.215707: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [2]:
# Load the image/caption records from disk.
# json.load parses straight from the file handle, and the explicit encoding
# keeps decoding independent of the platform's default locale.
with open("../../data/image_caption_mapped.json", "r", encoding="utf-8") as r:
    data = json.load(r)

In [3]:
# Import the dataset and read the text file into a separate variable

text_file = "../../data/flikr8k/captions.txt"

def load_doc(filename, encoding="utf-8"):
    """Return the full contents of a text file as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        The decoded file contents as one ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(filename, "r", encoding=encoding) as f:
        return f.read()

# Read the whole captions file and preview its first 300 characters.
doc = load_doc(text_file)
print(doc[:300])

image,caption
1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg,A girl going into a wooden building .
1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e.jpg,A little girl climbing


In [5]:
# Split the loaded records into three parallel lists, one entry per record.
all_img_id = [record["image"] for record in data]
all_img_path = [record["path"] for record in data]
annotations = [record["caption"] for record in data]

# Assemble the lists column-wise into a DataFrame and preview the first rows.
df = pd.DataFrame(
    {
        "ID": all_img_id,
        "Path": all_img_path,
        "Captions": annotations,
    }
)
df.head()

Unnamed: 0,ID,Path,Captions
0,1000268201_693b08cb0e.jpg,../../data/flikr8k/Images/1000268201_693b08cb0...,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,../../data/flikr8k/Images/1000268201_693b08cb0...,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,../../data/flikr8k/Images/1000268201_693b08cb0...,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,../../data/flikr8k/Images/1000268201_693b08cb0...,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,../../data/flikr8k/Images/1000268201_693b08cb0...,A little girl in a pink dress going into a woo...


In [7]:
# Wrap every caption in <start> / <end> markers.
# The captions are re-read from `data` instead of wrapping `annotations` in
# place: wrapping in place stacks another pair of markers onto every caption
# each time this cell is re-run, whereas rebuilding from `data` always yields
# exactly one pair no matter how often the cell is executed.
annotations = ["<start> " + record["caption"] + " <end>" for record in data]

# `all_img_path` holds one path per caption record, so both totals below are
# record counts rather than a count of distinct image files.
print("Total captions present in the dataset: " + str(len(annotations)))
print("Total images present in the dataset: " + str(len(all_img_path)))

Total captions present in the dataset: 40455
Total images present in the dataset: 40455


In [8]:
# Create the vocabulary & the counter for the captions.
# The joined captions are tokenized a single time; the vocabulary is simply the
# set of keys of the counter, and stop words are looked up in a set.
# NOTE(review): word_tokenize splits "<start>" / "<end>" into "<", "start", ">"
# and the stop-word test is case-sensitive, so tokens such as "A" and "The"
# are kept in both structures — confirm whether that is intended.
stopword_set = set(stopwords)

val_count = Counter(
    token
    for token in word_tokenize(" ".join(annotations))
    if token not in stopword_set
)

vocabulary = set(val_count)

In [9]:
# Show the 30 most frequently occurring tokens in the captions.

val_count.most_common(30)

[('<', 161820),
 ('>', 161820),
 ('end', 80962),
 ('start', 80919),
 ('.', 36581),
 ('A', 22667),
 ('dog', 7984),
 ('man', 6829),
 ('Two', 4365),
 ('white', 3876),
 ('black', 3696),
 ('boy', 3442),
 (',', 3232),
 ('woman', 3228),
 ('girl', 3218),
 ('The', 3089),
 ('wearing', 3061),
 ('water', 2778),
 ('red', 2660),
 ('brown', 2475),
 ('people', 2446),
 ('young', 2432),
 ('blue', 2259),
 ('dogs', 2083),
 ('running', 2072),
 ('playing', 2008),
 ('shirt', 1806),
 ('standing', 1786),
 ('ball', 1779),
 ('little', 1625)]

In [11]:
# Create the tokenizer.

# Vocabulary size cap, passed to the tokenizer as num_words.
max_features = 5000
# The filter characters leave out '<' and '>' so the "<start>" / "<end>"
# markers are not stripped from the captions; "UNK" is the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=max_features, oov_token="UNK", filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

In [12]:
# Inspect the tokenizer's settings before it has been fitted on any text.
tokenizer.get_config()

{'num_words': 5000,
 'filters': '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': 'UNK',
 'document_count': 0,
 'word_counts': '{}',
 'word_docs': '{}',
 'index_docs': '{}',
 'index_word': '{}',
 'word_index': '{}'}

In [13]:
# Create word-to-index and index-to-word mappings.

# Build the tokenizer's vocabulary from the caption strings.
tokenizer.fit_on_texts(annotations)

# Snapshot the fitted tokenizer's configuration.
config = tokenizer.get_config()

# NOTE(review): get_config() reports its dictionaries as strings (the pre-fit
# config shows '{}' for these keys), so both names below appear to hold
# serialized text rather than dicts — confirm how later cells consume them;
# tokenizer.word_index / tokenizer.index_word expose the mappings directly.
word_to_index = config["word_index"]
index_to_word = config["index_word"]

In [14]:
# Count how often each word occurs after the tokenizer's text processing and
# list the 30 most frequent ones.
# The config holds word_counts as a JSON string, so it is decoded with
# json.loads instead of eval, which would execute whatever the string contains.
counter = Counter(json.loads(config["word_counts"]))
counter.most_common(30)

[('<start>', 80910),
 ('<end>', 80910),
 ('a', 62992),
 ('in', 18986),
 ('the', 18419),
 ('on', 10745),
 ('is', 9345),
 ('and', 8862),
 ('dog', 8138),
 ('with', 7765),
 ('man', 7274),
 ('of', 6723),
 ('two', 5642),
 ('white', 3959),
 ('black', 3848),
 ('boy', 3581),
 ('are', 3504),
 ('woman', 3402),
 ('girl', 3328),
 ('to', 3176),
 ('wearing', 3062),
 ('at', 2915),
 ('people', 2883),
 ('water', 2790),
 ('red', 2691),
 ('young', 2630),
 ('brown', 2578),
 ('an', 2432),
 ('his', 2357),
 ('blue', 2279)]

In [15]:
# Convert every caption into a list of word ids, pad the lists to a common
# length and store the result in a variable.
sequences = tokenizer.texts_to_sequences(annotations)

# padding="pre" is the library default, spelled out here for visibility.
# NOTE(review): this puts the padding in front of each caption — confirm the
# downstream model expects leading rather than trailing padding.
cap_vector = pad_sequences(sequences, padding="pre")

print(f"The shape of Caption vector is :{cap_vector.shape}")

The shape of Caption vector is :(40455, 41)
