In [254]:
#IMPORTS
import string
from collections import Counter
import datetime as dt
import numpy as np
import pickle
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model
from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input

In [227]:
#EXTRACTING TEXT DATA AND PREPROCESSING
# Builds, from the Flickr8k token file:
#   tokens_dic         - image title -> list of cleaned captions
#   total_words_list   - every cleaned word, in order of appearance
#   max_caption_length - largest number of words in any cleaned caption

#STEP: EXTRACTING DATA
filepath = "../../../Downloads/Flickr8k/Flickr8k_text/Flickr8k.token.txt"
# The context manager closes the handle even if read() raises.
with open(filepath, "r") as file:
    content = file.read()
lines = content.split('\n')

#STEP: SAVING DATA IN A DICTIONARY FOR TIME EFFICIENCY PURPOSES
# Each non-blank line is expected to be "<image field>\t<caption>"; the image
# title is the part of the image field before its first '.'.
tokens_dic = {}
for line in lines:
    if not line.strip():
        # A trailing newline (or stray blank line) produces an empty entry
        # that has no tab to split on; skip it instead of crashing.
        continue
    # maxsplit=1 keeps a caption intact even if it happens to contain a tab.
    image_, caption = line.split('\t', 1)
    image_title = image_.split('.')[0]
    tokens_dic.setdefault(image_title, []).append(caption)

#STEP: PREPROCESSING
# Lower-case every word, strip punctuation, and keep purely alphabetic words.
# To also drop one-letter words, add `and len(word) > 1` to the filter below.
table = str.maketrans("", "", string.punctuation)
max_caption_length = 0
total_words_list = []
for captions in tokens_dic.values():
    for i, caption in enumerate(captions):
        words = [word.lower().translate(table) for word in caption.split()]
        words = [word for word in words if word.isalpha()]
        # Words are collected WITHOUT a trailing space so that word counts are
        # keyed by the same tokens that caption.split() yields later on.
        total_words_list.extend(words)
        # Stored captions keep one space after every word (including the last):
        # later cells append "endseq" directly, with no separator of their own.
        captions[i] = ''.join(word + ' ' for word in words)
        max_caption_length = max(max_caption_length, len(words))
#NOTE: MAX CAPTION LENGTH BY LETTING 1 LENGTH WORDS BE IN CAPTION IS 35
#NOTE: MAX CAPTION LENGTH BY NOT LETTING 1 LENGTH WORDS BE IN CAPTION IS 32
print(max_caption_length)

35


In [228]:
#CONSTRUCTING VOCABULARY FROM CAPTIONS
# Every distinct whitespace-separated token found in any stored caption.
vocabulary = {
    token
    for caption_group in tokens_dic.values()
    for text in caption_group
    for token in text.split()
}
#NOTE: SIZE OF VOCABULARY BY LETTING 1 LENGTH WORDS BE IN CAPTION IS 8775
#NOTE: SIZE OF VOCABULARY BY NOT LETTING 1 LENGTH WORDS BE IN CAPTION IS 8763

#CONSTRUCTING MOST PROBABLE VOCABULARY FROM ALL WORDS
# Keep only the entries of total_words_list that occur at least 10 times.
counter = Counter(total_words_list)
commons = counter.most_common()
most_probable_vocabulary = {
    token for token, occurrences in commons if occurrences >= 10
}
#NOTE: SIZE OF MOST PROBABLE VOCABULARY BY LETTING 1 LENGTH WORDS BE IN CAPTION IS 1950
#NOTE: SIZE OF MOST PROBABLE VOCABULARY BY NOT LETTING 1 LENGTH WORDS BE IN CAPTION IS 1947

In [229]:
#LOADING TRAINING SET
# Builds:
#   train         - list of image titles named in the train split file
#   train_dataset - image title -> captions wrapped in startseq/endseq markers

#STEP: EXTRACTING DATA
filepath = "../../../Downloads/Flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt"
# The context manager closes the handle even if read() raises.
with open(filepath, "r") as file:
    content = file.read()
lines = content.split('\n')

#STEP: SAVING DATA IN A LIST
train = []
for line in lines:
    if not line.strip():
        # Blank entries (e.g. from a trailing newline) carry no file name.
        continue
    # Title = text before the first '.', the same rule used for tokens_dic keys,
    # so names with more than one dot still line up with their captions.
    image_title = line.split('.')[0]
    train.append(image_title)
#NOTE: SIZE OF TRAIN LIST IS 6000

#STEP: SAVING IMAGE-CAPTIONS IN TRAIN DATASET
# A set gives constant-time membership checks; scanning the list for every
# key of tokens_dic would be quadratic.
train_titles = set(train)
train_dataset = {}
for image_title, captions in tokens_dic.items():
    if image_title in train_titles:
        # No separator before "endseq": cleaned captions already end in a space.
        train_dataset[image_title] = [
            "startseq " + caption + "endseq" for caption in captions
        ]
#NOTE: SIZE OF TRAIN DATASET IS 6000

In [236]:
#LOADING TESTING SET
# Builds:
#   test         - list of image titles named in the test split file
#   test_dataset - image title -> captions wrapped in startseq/endseq markers

#STEP: EXTRACTING DATA
filepath = "../../../Downloads/Flickr8k/Flickr8k_text/Flickr_8k.testImages.txt"
# The context manager closes the handle even if read() raises.
with open(filepath, "r") as file:
    content = file.read()
lines = content.split('\n')

#STEP: SAVING DATA IN A LIST
test = []
for line in lines:
    if not line.strip():
        # Blank entries (e.g. from a trailing newline) carry no file name.
        continue
    # Title = text before the first '.', the same rule used for tokens_dic keys,
    # so names with more than one dot still line up with their captions.
    image_title = line.split('.')[0]
    test.append(image_title)
#NOTE: SIZE OF TEST LIST IS 1000

#STEP: SAVING IMAGE-CAPTIONS IN TEST DATASET
# A set gives constant-time membership checks; scanning the list for every
# key of tokens_dic would be quadratic.
test_titles = set(test)
test_dataset = {}
for image_title, captions in tokens_dic.items():
    if image_title in test_titles:
        # No separator before "endseq": cleaned captions already end in a space.
        test_dataset[image_title] = [
            "startseq " + caption + "endseq" for caption in captions
        ]
#NOTE: SIZE OF TEST DATASET IS 1000

In [248]:
#ENCODING TRAIN IMAGES: TRANSFER LEARNING THROUGH INCEPTIONV3 WITH IMAGENET WEIGHTS
# model_new maps an input image batch to the output of InceptionV3's
# second-to-last layer; that output is stored as the image's feature vector.
model = InceptionV3(weights='imagenet')
model_new = Model(model.input, model.layers[-2].output)

encoded_train_images = {}
start = dt.datetime.now()
for i, image_title in enumerate(train_dataset):
    image_path = "../../../Downloads/Flickr8k/Flicker8k_dataset/" + image_title + ".jpg"
    img = image.load_img(image_path, target_size=(299, 299))
    # Add a leading batch axis of size 1, then apply InceptionV3 preprocessing.
    x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))
    feature_vec = model_new.predict(x)
    # Flatten the single-image batch output into a 1-D vector.
    feature_vec = feature_vec.reshape(feature_vec.shape[1])
    encoded_train_images[image_title] = feature_vec
    # Progress report on every 500th image (including the very first one).
    if i % 500 == 0:
        present = dt.datetime.now()
        print("{}th ITERATION".format(i))
        print("TIME SO FAR: {}s".format((present-start).total_seconds()))
end = dt.datetime.now()
print("TIME TAKEN TO ENCODE TRAIN IMAGES INTO 2048 LENGTH FEATURE VECTOR: {}s".format((end-start).total_seconds()))

0th ITERATION
TIME SO FAR: 8.841215s
500th ITERATION
TIME SO FAR: 218.930252s
1000th ITERATION
TIME SO FAR: 359.79328s
1500th ITERATION
TIME SO FAR: 455.496382s
2000th ITERATION
TIME SO FAR: 549.198238s
2500th ITERATION
TIME SO FAR: 644.261416s
3000th ITERATION
TIME SO FAR: 744.530409s
3500th ITERATION
TIME SO FAR: 848.340382s
4000th ITERATION
TIME SO FAR: 954.615744s
4500th ITERATION
TIME SO FAR: 1063.575684s
5000th ITERATION
TIME SO FAR: 1175.704234s
5500th ITERATION
TIME SO FAR: 1292.065183s
TIME TAKEN TO ENCODE TRAIN IMAGES INTO 2048 LENGTH FEATURE VECTOR: 1406.20231s


In [249]:
#ENCODING TEST IMAGES: TRANSFER LEARNING THROUGH INCEPTIONV3 WITH IMAGENET WEIGHTS
# model_new maps an input image batch to the output of InceptionV3's
# second-to-last layer; that output is stored as the image's feature vector.
model = InceptionV3(weights='imagenet')
model_new = Model(model.input, model.layers[-2].output)

encoded_test_images = {}
start = dt.datetime.now()
for i, image_title in enumerate(test_dataset):
    image_path = "../../../Downloads/Flickr8k/Flicker8k_dataset/" + image_title + ".jpg"
    img = image.load_img(image_path, target_size=(299, 299))
    # Add a leading batch axis of size 1, then apply InceptionV3 preprocessing.
    x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))
    feature_vec = model_new.predict(x)
    # Flatten the single-image batch output into a 1-D vector.
    feature_vec = feature_vec.reshape(feature_vec.shape[1])
    encoded_test_images[image_title] = feature_vec
    # Progress report on every 500th image (including the very first one).
    if i % 500 == 0:
        present = dt.datetime.now()
        print("{}th ITERATION".format(i))
        print("TIME SO FAR: {}s".format((present-start).total_seconds()))
end = dt.datetime.now()
print("TIME TAKEN TO ENCODE TEST IMAGES INTO 2048 LENGTH FEATURE VECTOR: {}s".format((end-start).total_seconds()))

0th ITERATION
TIME SO FAR: 5.82063s
500th ITERATION
TIME SO FAR: 120.00942s
TIME TAKEN TO ENCODE TEST IMAGES INTO 2048 LENGTH FEATURE VECTOR: 234.424786s


In [256]:
#WRITING ENCODED TRAIN IMAGES AND ENCODED TEST IMAGES TO PKL FILES

# Each feature dictionary is pickled to its own file in the working directory.
pickle_targets = (
    ("encoded_train_images.pkl", encoded_train_images),
    ("encoded_test_images.pkl", encoded_test_images),
)
for pickle_name, encoded_images in pickle_targets:
    with open(pickle_name, "wb") as encoded_pickle:
        pickle.dump(encoded_images, encoded_pickle)