# cleansing captions

In [10]:
# project path
prpath = r'C:\Users\nbxyz\Desktop'

In [96]:
import string

In [97]:
#load file containing captions
def load_data(fname):
    #open file and read
    file = open(fname, 'r')
    read_txt = file.read()
    #close file
    file.close()
    return read_txt

In [117]:
# define a function to get descriptions for images in dataset
def extract_desc(file):
    #store in dict
    desc_map = {}
    
    #process each caption
    for cap in file.split('\n'):
        #get tokens
        tokens = cap.split()
        #ignore very small captions
        if len(cap) < 2:
            continue
        # mapping token0 : img_id and token1: img_desc
        img_id, img_desc = tokens[0], tokens[1:]
        #remove filename from img_id
        img_id = img_id.split('.')[0]
        #convert img_desc tokens to string format
        img_desc = ' '.join(img_desc)
        # create the list if needed
        if img_id not in desc_map:
            desc_map[img_id] = []
        # store description
        desc_map[img_id].append(img_desc)
    return desc_map

In [109]:
# cleaning descriptions extracted
def clean_desc(descriptions):
    # translation table for removal of punctuations
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            # tokenize
            desc = desc.split()
            # convert to lower case
            desc = [word.lower() for word in desc]
            # remove punctuation from each token
            desc = [w.translate(table) for w in desc]
            # remove hanging 's' and 'a'
            desc = [word for word in desc if len(word)>1]
            # remove tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]
            # store as string
            desc_list[i] =  ' '.join(desc)

In [110]:
# converting image descriptions to vocabulary
def to_vocabulary(descriptions):
    # build a list of all description strings
    img_desc = set()
    for key in descriptions.keys():
        [img_desc.update(d.split()) for d in descriptions[key]]
    return img_desc

In [111]:
# save to file
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [3]:
# start caption cleaning process

# get data
caption_data_path = prpath + '\img_caption_generator\dataset\captions.txt'
cap_data = load_data(caption_data_path)
# extract descriptions
img_desc = extract_desc(cap_data)
# clean descriptions
clean_desc(img_desc)
# get vocabulary
vocab = to_vocabulary(img_desc)
# save
save_descriptions(img_desc, 'descriptions.txt')

print('Loaded: %d ' % len(img_desc))
print('Vocabulary Size: %d' % len(vocab))

NameError: name 'load_data' is not defined

# generate feature vector for images

In [27]:
import tensorflow as tf
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [55]:
import os
import numpy as np
from PIL import Image
from pickle import dump
# Import pre-trained Xception model for extracting features
from keras.applications.xception import Xception

# for progress bar 
from tqdm import tqdm_notebook as tqdm

In [29]:
# Using Xception model to get feature vector from images
def extract_features(directory):
        model = Xception( include_top=False, pooling='avg' )
        features = {}
        for img in tqdm(os.listdir(directory)):
            filename = directory + "/" + img
            image = Image.open(filename)
            image = image.resize((299,299))
            image = np.expand_dims(image, axis=0)
            image = image/127.5
            image = image - 1.0
            feature = model.predict(image)
            features[img] = feature
        return features

In [30]:
with tf.device('/GPU:0'):
    # extract features from images
    image_path = prpath + '\img_caption_generator\dataset\Images'
    features = extract_features(image_path)
    # dump pickel file containing features
    dump(features, open(prpath + '\img_caption_generator\features.pkl', 'wb'))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for img in tqdm(os.listdir(directory)):


  0%|          | 0/8091 [00:00<?, ?it/s]

KeyboardInterrupt: 

# load dataset for training

In [2]:
# import required libraries
from numpy import array
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint

In [3]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [4]:
# load a pre-defined list of photo identifiers
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)

In [5]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            # store
            descriptions[image_id].append(desc)
    return descriptions

In [6]:
# load photo features
def load_photo_features(filename, dataset):
    # load all features
    all_features = load(open(filename, 'rb'))
    features = []
    # filter features
    for k in dataset:
        key = str(k)+'.jpg'
        features.append(all_features[key])
    return features

In [7]:
# covert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [8]:
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [9]:
# calculate the length of the description with the most words
def get_max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

In [10]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return array(X1), array(X2), array(y)

In [11]:
# define the captioning model
def define_model(vocab_size, max_length):
    # features from the CNN model squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # Merging both models
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summarize model
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [12]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
    # loop for ever over images
    while 1:
        counter = 0
        for key, desc_list in descriptions.items():
            # retrieve the photo feature
            photo = photos[counter][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo, vocab_size)
            yield [in_img, in_seq], out_word
            counter += 1

# training and testing

In [25]:
train_data = prpath + '\img_caption_generator\dataset\\train.txt'
test_data = prpath + '\img_caption_generator\dataset\\test.txt'
desc_path = prpath + '\img_caption_generator\\descriptions.txt'
features_path = prpath + '\img_caption_generator\\features.pkl'

In [45]:
# load training dataset
train = load_set(train_data)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions(desc_path, train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features(features_path, train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_len = get_max_length(train_descriptions)
print('Description Length: %d' % max_len)

with tf.device('/GPU:0'):
    # define the model
    model = define_model(vocab_size, max_len)
    # train the model, run epochs manually and save after each epoch
    epochs = 20
    steps = len(train_descriptions)
    for i in range(epochs):
        # create the data generator
        generator = data_generator(train_descriptions, train_features, tokenizer, max_len, vocab_size)
        # fit for one epoch
        model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
        # save model
        model.save(prpath + '\img_caption_generator\model_' + str(i) + '.h5')

Dataset: 7000
Descriptions: train=7000
Photos: train=7000
Vocabulary Size: 8166
Description Length: 33
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 33)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 33, 256)      2090496     ['input_5[0][0]']                
                                                                                                  
 dropout_1 (Dropout)            (None, 2048)         0           ['input_4[0][0]']        

  model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)


 284/7000 [>.............................] - ETA: 47:53 - loss: 6.1034

KeyboardInterrupt: 

In [14]:
# import the libraries
from numpy import argmax
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [15]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [16]:
# load a pre-defined list of photo identifiers
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)


In [17]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            # store
            descriptions[image_id].append(desc)
    return descriptions

In [18]:
# load photo features
def load_photo_features(filename, dataset):
    # load all features
    all_features = load(open(filename, 'rb'))
    # filter features
    features = {k: all_features[k] for k in dataset}
    return features

In [19]:
# covert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [20]:
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [21]:
# calculate the length of the description with the most words
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

In [22]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [30]:
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    # seed the generation process
    in_text = 'startseq'
    # iterate over the whole length of the sequence
    for i in range(max_length):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([photo,sequence], verbose=0)
        # convert probability to integer
        yhat = argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text

In [33]:
# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    #counter = 0
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
        #counter += 1
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [34]:
# prepare tokenizer on train set

# load training dataset
train = load_set(train_data)
print('Dataset: %d' % len(train))
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_len = get_max_length(train_descriptions)
print('Description Length: %d' % max_len)

Dataset: 7000
Descriptions: train=7000
Vocabulary Size: 8166
Description Length: 33


# test set evaluations

In [None]:
# load test data
test = load_set(test_data)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions(desc_path, test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features(features_path, test)
print('Photos: test=%d' % len(test_features))

# load the model
filename = r'C:\Users\nbxyz\Desktop\img_caption_generator\model_19.h5'
model = load_model(filename)
# evaluate model
evaluate_model(model, test_descriptions, test_features, tokenizer, max_len)