In [76]:
import numpy as np
import pydot
import graphviz
from tensorflow.keras.applications.efficientnet import EfficientNetB0
from tensorflow.keras.preprocessing import image
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import AveragePooling2D, GlobalAveragePooling2D, Dense, Input
from tensorflow.keras.layers import Embedding, LSTM, Add

In [72]:
!pip install pydot

Collecting pydot
  Downloading pydot-1.4.2-py2.py3-none-any.whl (21 kB)
Installing collected packages: pydot
Successfully installed pydot-1.4.2


In [73]:
!pip install graphviz

Collecting graphviz
  Using cached graphviz-0.20-py3-none-any.whl (46 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.20


## Build CNN model with Pooling and Dense layers

### Test on one image

In [45]:
img_path = '../raw_data/images/10815824_2997e03d76.jpg'

In [46]:
img = image.load_img(img_path, target_size=(256,256,3))
x = image.img_to_array(img)

In [47]:
x = np.expand_dims(x, axis=0)
x.shape

(1, 256, 256, 3)

### CNN Model layers

# Main notebook for data processing in robo_romeo project

## Imports - this should do us for the whole project. 

In [None]:
import numpy as np
from PIL import Image
import os
import string
from pickle import dump
from pickle import load
import tensorflow as tf
from tensorflow.keras.applications.xception import Xception #to get pre-trained model Xception
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer #for text tokenization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import add
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense#Keras to build our CNN and LSTM
from tensorflow.keras.layers import LSTM, Embedding, Dropout
from tqdm import tqdm_notebook as tqdm #to check loop progress
tqdm().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


0it [00:00, ?it/s]

## Data cleaning

 - load_doc( filename ) – To load the document file and read the contents of the file into a string.

In [None]:
# Load the document file into memory
def load_doc(filename):
    # Open file to read
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

 - load_descriptions(doc) – To create a description dictionary that will map images with all 5 captions.

In [None]:
# extract descriptions for images
def load_descriptions(doc):
    mapping = dict()
    # process lines
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        if len(line) < 2:
            continue
        # take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        # remove filename from image id
        image_id = image_id.split('.')[0]
        # convert description tokens back to string
        image_desc = ' '.join(image_desc)
        # create the list if needed
        if image_id not in mapping:
            mapping[image_id] = list()
        # store description
        mapping[image_id].append(image_desc)
    return mapping

 - clean_descriptions( descriptions) – to clean the data by taking all descriptions as input. This will perform several types of cleaning including uppercase to lowercase conversion, punctuation removal, and removal of the number containing words.



In [None]:
# clean descriptions
clean_descriptions(descriptions)

 - txt_vocab( descriptions ) – to create a vocabulary from all the unique words extracted out from descriptions.



In [None]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    # build a list of all description strings
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    return all_desc

 - save_descriptions( descriptions, filename ) – This function is used to store all the preprocessed descriptions into a file.



In [None]:
!pwd

In [None]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
         

In [None]:
filename = 'Flickr8k_text/Flickr8k.token.txt'
# load descriptions
doc = load_doc(filename)
# parse descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
# clean descriptions
clean_descriptions(descriptions)
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))
# save to file
save_descriptions(descriptions, 'descriptions.txt')


# Extract feature vector

Xception model takes images in size 299x299x3. we need to edit the last calssification layer.
Extract_features() function extracts features for all images and puts the features dictionary into a pickle file named “features.p”.

In [None]:
def extract_features(directory):
    model = Xception( include_top=False, pooling='avg' )
    features = {}
    for pic in tqdm(os.listdir(dir)):
        file = dir + "/" + pic
        image = Image.open(file)
        image = image.resize((299,299))
        image = np.expand_dims(image, axis=0)
        image = image/127.5
        image = image - 1.0
        feature = model.predict(image)
        features[image] = feature
    return features
#2048 feature vector
features = extract_features(dataset_images)
dump(features, open("features.p","wb"))
#to directly load the features from the pickle file.
features = load(open("features.p","rb"))

# loading dataset for model training

Flickr_8k.trainImages.txt contains a list of 6000 image names that are used for training

Functions required to load the training datasets:

 - load_photos( fname ) – takes a file name as a parameter and return the list of image names by loading the text file into a string.

 - load_clean_descriptions( fname, image) – stores the captions for every image from the list of photos to a dictionary. For the ease of the LSTM model in identifying the beginning and ending of a caption, we append the and identifier with each caption. ('start' and 'end' tags at the beginning and end of each caption.

 - load_features(photos) – to return the extracted feature vectors from the Xception model and the dictionary for photos.

In [None]:
#load the data
def load_photos(filename):
    file = load_doc(filename)
    photos = file.split("n")[:-1]
    return photos

In [None]:
def load_clean_descriptions(filename, photos):
    #loading clean_descriptions
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("n"):
        words = line.split()
        if len(words)<1 :
            continue
        image, image_caption = words[0], words[1:]
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
                desc = ' ' + " ".join(image_caption) + ' '
                descriptions[image].append(desc)
    return descriptions

In [None]:
def load_features(photos):
    #loading all features
    all_features = load(open("features.p","rb"))
    #selecting only needed features
    features = {k:all_features[k] for k in photos}
    return features

In [None]:
filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"
#train = loading_data(filename)
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

In [37]:
inputs1 = Input(shape=(256,256,3))

In [38]:
CNN_model = EfficientNetB0(
    include_top=False, # Whether to include the fully-connected layer at the top of the network
    weights='imagenet', # pre-trained weights on ImageNet
    input_tensor=None,
    input_shape= (256,256,3), # It should have exactly 3 inputs channels
    pooling=None # Optional pooling mode for feature extraction when include_top is False
)(inputs1)

In [63]:
pooling = GlobalAveragePooling2D()(CNN_model)
cnn_dense = Dense(256, activation='relu')(pooling)
model1 = Model(inputs=inputs1, outputs=cnn_dense)

In [64]:
model1.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 256, 256, 3)]     0         
                                                                 
 efficientnetb0 (Functional)  (None, 8, 8, 1280)       4049571   
                                                                 
 global_average_pooling2d_5   (None, 1280)             0         
 (GlobalAveragePooling2D)                                        
                                                                 
 dense_4 (Dense)             (None, 256)               327936    
                                                                 
Total params: 4,377,507
Trainable params: 4,335,484
Non-trainable params: 42,023
_________________________________________________________________


## Combine with LSTM sequence model

### LSTM Model layers

In [55]:
max_caption_length = 32+1
vocab_size = 8763+2

In [59]:
inputs2 = Input(shape=(max_caption_length,))
embed_layer = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
lstm_layer = LSTM(256)(embed_layer)

### Combine CNN and LSTM

In [78]:
decoder1 = Add()([cnn_dense,lstm_layer])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

### Model summary

In [79]:
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
model.summary()

In [82]:
plot_model(model,show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.
