In [3]:
import string
import numpy as np
from pickle import dump, load
import os
from PIL import Image

from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.layers import add
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Embedding

# small library for seeing the progress of loops.
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [3]:
# load text file in memory
def load_doc(filename):
    # Opening the file as read only
    file = open(filename,'r')
    text = file.read()
    file.close()
    return text
# get all imgs with their captions
def all_img_captions( filename):
    file = load_doc(filename)
    captions = file.split('\n') # colect row
    descriptions ={}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
#         print(img[:-2])
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [caption]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions


In [4]:
#Data cleaning- lower casing, removing puntuations and words containing numbers
# captions = descriptions ( dictionary)
def cleaning_text(captions):
    table = str.maketrans('','',string.punctuation)
    #dict : for key, value in captions.items()
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):
            img_caption.replace('-',' ')
            desc = img_caption.split()
            #convert word to lowercase
            desc =[word.lower() for word in desc]
            #remove punctuation
            desc =[word.translate(table) for word in desc]
            #remove hanging 's and a 
            desc =[word for word in desc if len(word)>1]
            #remove tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]
            #convert back to string
            img_caption =" ".join(desc)
            captions[img][i] = img_caption # descriptions[img][i] = img_caption
    return captions
# text_vocabulary : tập hợp các từ vựng trong descriptions
def text_vocabulary(descriptions):
    # build vocabulary of all unique words
    vocab =set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

# All description in one file
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key +'\t'+ desc)
    data ="\n".join(lines)
    file = open(filename,'w')
    file.write(data)
    file.close()
    
# Set these path according to project folder in you system
dataset_text = 'C:/Users/PC/Documents/Flickr8k_text'
# dataset_images ='E:/Document_Python/picture/Flicker8k_Dataset'
dataset_images ='C:/Users/PC/Documents/Flicker8k_Dataset'
# print(os.listdir(dataset_images))
#we prepare our text data
filename ='Flickr8k_token.txt'
#loading the file that contains all data
#mapping them into descriptions dictionary img to 5 captions
descriptions = all_img_captions(filename)
print("Length of descriptions: ", len(descriptions))
# print(descriptions)
# clear descriptions
clean_descriptions = cleaning_text(descriptions)
# print(clean_descriptions)
# build vocabulary
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary:", len(vocabulary))
print(vocabulary)
# save
save_descriptions(clean_descriptions, 'descriptions.txt')

Length of descriptions:  8092
Length of vocabulary: 8763




In [5]:
# EXTRACT FEATURE OF IMAGES
def extract_features(directory):
    model = Xception( include_top = False, pooling ='avg')
    features ={}
    for img in tqdm(os.listdir(directory)):
        filename = directory + "/"+ img
        image = Image.open(filename)
        image = image.resize((299,299))
        image = np.expand_dims(image, axis =0)
        #image = preprocess_input(image)
        image = image/127.5
        image = image -1.0
        
        feature = model.predict(image)
        features[img] = feature
    return features # return dictionary

#2048 feature vector
features = extract_features(dataset_images)
dump(features, open ('features.p','wb'))

Instructions for updating:
Colocations handled automatically by placer.


HBox(children=(IntProgress(value=0, max=8091), HTML(value='')))




In [8]:
features = load( open('features.p','rb'))
print(features)

{'1000268201_693b08cb0e.jpg': array([[0.35922778, 0.0327193 , 0.03686446, ..., 0.13820325, 0.02857176,
        0.3070938 ]], dtype=float32), '1001773457_577c3a7d70.jpg': array([[0.        , 0.21514867, 0.00040347, ..., 0.20985585, 0.39448455,
        0.0368709 ]], dtype=float32), '1002674143_1b742ab4b8.jpg': array([[0.        , 0.06792142, 0.02688157, ..., 0.        , 0.04563359,
        0.056624  ]], dtype=float32), '1003163366_44323f5815.jpg': array([[0.21431103, 0.00188817, 0.24350882, ..., 0.20903358, 0.15304087,
        0.10112188]], dtype=float32), '1007129816_e794419615.jpg': array([[0.        , 0.09760144, 0.7316974 , ..., 0.00364152, 0.01882428,
        0.4823734 ]], dtype=float32), '1007320043_627395c3d8.jpg': array([[0.09296172, 0.        , 0.017042  , ..., 0.00123643, 0.2865616 ,
        0.06388022]], dtype=float32), '1009434119_febe49276a.jpg': array([[0.        , 0.        , 0.02652833, ..., 0.25827754, 0.261811  ,
        0.15566562]], dtype=float32), '1012212859_01547e3

        0.0802551 ]], dtype=float32)}


In [9]:
# LOAD DATASET FOR TRAIN THE MODEL
# load name of pictures
def load_photos(filename):
    file = load_doc(filename)
    photos = file.split('\n')[:-1]
    return photos
#  return dictionary : {key = image_name, value = image_caption}
def load_clean_descriptions(filename, photos):
    # load filename descriptions
    file = load_doc(filename)
    descriptions ={}
    for line in file.split('\n'):
        words = line.split()
        if len(words)<1:
            continue
        image, caption_image = words[0], words[1:]
        if image in photos:
            if image not in descriptions:
                descriptions[image] =[]
            desc = '<start>'+" ".join(caption_image)+'<end>'
            descriptions[image].append(desc)
    return descriptions
def load_features(photos):
    # load all features
    all_features = load( open('features.p','rb'))
    # select need feature
    features = {k: all_features[k] for k in photos}
    return features


filename = dataset_text +"/" +"Flickr_8k.trainImages.txt"
# train
train_imgs = load_photos(filename)
# print(train_imgs)
train_descriptions = load_clean_descriptions('descriptions.txt',train_imgs)
print(train_descriptions)
train_features = load_features(train_imgs)
# print(train_features)






In [10]:
# TOKENIZER VOCABULARY
#converting dictionary to clean list of descriptions
def dict_to_list(descriptions):
    all_desc =[]
    for key in descriptions.keys():
        [all_desc.append(k) for k in descriptions[key]]
    return all_desc
#creating tokenizer class 
#this will vectorise text corpus
#each integer will represent token in dictionary
from tensorflow.keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer
# give each word an index, and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer.p','wb'))
vocab_size = len(tokenizer.word_index) +1
print(vocab_size)

# cal maximum length of descriptions (độ dài tối đa của 1 caption image)
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)
max_length = max_length(descriptions)
print(max_length)
description_list = dict_to_list(descriptions)

7577
32


In [11]:
#create input-output sequence pairs from the image description.
#data generator, used by model.fit_generator()
def create_sequences(tokenizers,max_length, desc_list, feature):
    X1, X2, y = list(), list(),list()
    # walk through each description for the image
    for desc in desc_list:
        #encode sequence
        seq = tokenizers.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1,len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen= max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
            
            # store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)
def data_generator(descriptions, features, tokenizers,max_length):
    while 1:
        
        for key, description_list in descriptions.items():
            # retrieve photo features
            feature = features[key][0] #lấy tất cả features của bức ảnh trên 1 hàng
            input_image, input_sequence, out_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield [[input_image, input_sequence],out_word]
#You can check the shape of the input and output for your model
[a,b],c = next(data_generator(train_descriptions, features,tokenizer, max_length))
a.shape, b.shape, c.shape

((47, 2048), (47, 32), (47, 7577))

In [22]:
# DEFINE CNN- RNN MODEL
from tensorflow.keras.utils import plot_model
# define caption model
def define_model (vocab_size, max_length):
    
    # features from the CNN model squeezed from 2048 to 256 nodes ( pictures)
    inputs1 = Input(shape=(2048,))
    fe1     = Dropout(0.5)(inputs1)
    fe2     = Dense(256, activation ='relu')(fe1)
    # model sequences LSTM (sequences)
    inputs2 = Input(shape= (max_length,))
    se1     = Embedding(vocab_size,256,mask_zero = True)(inputs2)
    se2     = Dropout(0.5)(se1)
    se3     = LSTM(256)(se2)
    
    # merge 2 models
    decode1 = add([fe2,se3])
    decode2 = Dense(256, activation='relu')(decode1)
    outputs  = Dense (vocab_size, activation ='softmax')(decode2)
    
     # tie it together [image, seq] [word]
    model = Model(inputs = [inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    # summary model
    print(model.summary())
    #plot_model(model, to_file ='model.png', show_shapes= True)
    
    return model
    

In [31]:
# TRAIN MODEL
print("Dataset:", len(train_imgs))
print( "Descriptions: train=: ", len( train_descriptions))
print("Photo: train =", len(train_features))
print("Vocabulary size:", vocab_size)
print("Descriptions length: ", max_length)

model = define_model(vocab_size, max_length)
epochs =10
steps = len(train_descriptions)
# making a directory models to save our models
os.mkdir("C:/Users/PC/Documents/models")
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features,tokenizer, max_length)
    model.fit_generator(generator, epochs=1,steps_per_epoch = steps, verbose =1)
    model.save("models/model_"+ str(i)+ ".h5")

Dataset: 6000
Descriptions: train=:  6000
Photo: train = 6000
Vocabulary size: 7577
Descriptions length:  32
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           (None, 32)           0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 32, 256)      1939712     input_23[0][0]                   
__________________________________________________________________________________________________
dropout_19 (Dropout)            (None, 2048)         0           input_22[0][0]                   


ValueError: Output of generator should be a tuple `(x, y, sample_weight)` or `(x, y)`. Found: [[array([[0.35922778, 0.0327193 , 0.03686446, ..., 0.13820325, 0.02857176,
        0.3070938 ],
       [0.35922778, 0.0327193 , 0.03686446, ..., 0.13820325, 0.02857176,
        0.3070938 ],
       [0.35922778, 0.0327193 , 0.03686446, ..., 0.13820325, 0.02857176,
        0.3070938 ],
       ...,
       [0.35922778, 0.0327193 , 0.03686446, ..., 0.13820325, 0.02857176,
        0.3070938 ],
       [0.35922778, 0.0327193 , 0.03686446, ..., 0.13820325, 0.02857176,
        0.3070938 ],
       [0.35922778, 0.0327193 , 0.03686446, ..., 0.13820325, 0.02857176,
        0.3070938 ]], dtype=float32), array([[   0,    0,    0, ...,    0,    0,    2],
       [   0,    0,    0, ...,    0,    2,   42],
       [   0,    0,    0, ...,    2,   42,    3],
       ...,
       [   0,    0,    0, ...,  169,  313,   64],
       [   0,    0,    0, ...,  313,   64,  195],
       [   0,    0,    0, ...,   64,  195, 2913]])], array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)]

In [13]:
import numpy as np
import argparse
from PIL import Image
import matplotlib.pyplot as plt
from pickle import dump, load
import cv2
# ap = argparse.ArgumentParser()
# ap.add_argument('-i', '--image', required=True, help="Image Path")
# args = vars(ap.parse_args())
# img_path = args['image']
img_path ='C:/Users/PC/Documents/Flicker8k_Dataset/10815824_2997e03d76.jpg'
def extract_features(filename,model):
    try:
        image = Image.open(filename)
    except:
        print("ERROR: Couldn't open image! Make sure the image path and extension is correct")
    image = image.resize((299,299))
    image = np.array(image)
    # for images that has 4 channels, we convert them into 3 channels
    if image.shape[2] ==4 :
        image = image [...,:3]
    image = np.expand_dims(image, axis =0)
    image = image/ 127.5
    image = image -1
    feature = model.predict(image)
    return feature
def word_for_id( integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
def generate_desc(model, tokenizer, photo, max_length):
    in_text ='start'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen = max_length)
        pred     = model.predict([photo,sequence], verbose =0)
        pred     = np.argmax(pred)
        word   = word_for_id(pred, tokenizer)
        if word is None:
            break
        in_text += " " + word
        if word == 'end':
            break
    return in_text

#path = 'Flicker8k_Dataset/111537222_07e56d5a30.jpg'
max_length =32
tokenizer = load(open('tokenizer.p','rb'))
model = load_model('models/model_9.h5')
xception_model = Xception(include_top =False, pooling ='avg')

photo = extract_features(img_path, xception_model)
img = cv2.imread(img_path)

description = generate_desc(model, tokenizer, photo, max_length)
print("\n\n")
print(description)
while 1:
    cv2.imshow('image',img)
    cv2.putText(img, description[5:-3],(50,50),2,0.8,(0,255,255),2,cv2.LINE_AA)
    if cv2.waitKey(20) & 0xFF == 27:
        break
cv2.destroyAllWindows()




start man in black shirt is sitting on bench with his officer end
