In [22]:
import numpy as np
import pandas as pd
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.models import Model
from keras.utils import plot_model
import time
import pickle

from keras.utils import to_categorical
from scipy.sparse import csr_matrix

# Create "image embeddings"

In order to use the images to learn captionings, we need to combine the power of Convolutional Neural Networs (CNN) and Recurrent Neural Networks (RNN)

It's not immediately clear how to do this. But, a common technique is to use a pretrained deep neural network, such as VGG16 or Inception, run said network on the training images, and extract the activations at a given layer to produce one vector per image. This vector can be thought of as an "image embedding."

The rationale is that deep networks like VGG16, which perform well on the ImageNet classification task, are capable of extracting very complex features from an image. Thus, if we take off the last couple of layers (which are responsible for the classification of an image to one of the ImageNet classes), then the vector we get will encode interesting features in our images. 

Here, I extract image embeddings

## 1. Initialize an abridged VGG16 model

First, I'll take a pre-trained VGG16 model, trained on the ImageNet dataset. Then I'll take off the last two layers, so that I can keep just the feature extracting mechanism and remove the part that is responsible for the ImageNet classification. 

In [2]:
model = VGG16()

In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [16]:
mkdir images

mkdir: images: File exists


In [21]:
plot_model(model, "images/vgg16.png", show_shapes=True)

![](images/vgg16.png)

In [3]:
# take off two layers from the model
model.layers.pop()
model.layers.pop()
model = Model(inputs=model.inputs, outputs=model.layers[-1].output)

In [23]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

## 2. Functions for feature extraction

Given the name of a csv file, which is assumed to have a column *photo_id*, create and return a dictionary of {image_id : [extracted features]} pairs. The extracted features will be vectors of dimension 4096. 

In [4]:
def extract_photo_features(list_file):
    # read the list of photos to read
    photo_list = pd.read_csv(open(list_file,'r'), encoding='utf-8', engine='c')['photo_id']
    # accumulate the extracted featues in {photo_id: [..features..]} pairs
    features = dict()
    i = 0
    starttime = time.time()
    for photo in photo_list:
        if i % 500 == 0:
            print("Processed %d photos in %f seconds" % (i, starttime - time.time()))
        i += 1
        # crate a file name
        fname = "../data/yelp_photos/photos/%s.jpg" % (photo)
        # open the photo
        try: 
            img = load_img(fname, target_size = (224,224))
            # preprocess to be compatible with VGG16 network
            img = img_to_array(img)
            img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
            img = preprocess_input(img)
            # run the abridged VGG16 model to extract the features
            feat = model.predict(img, verbose=0)
            # add to feature dictionary
            features[photo] = feat
        except:
            continue
    return features

## 3. Extract training, validation, and playground features

In [6]:
train_features = extract_photo_features("../data/split_lists/train_ids.csv")

Processed 0 photos in -0.001062 seconds
Processed 500 photos in -274.687827 seconds
Processed 1000 photos in -549.285257 seconds
Processed 1500 photos in -823.234057 seconds
Processed 2000 photos in -1097.450804 seconds
Processed 2500 photos in -1372.238161 seconds
Processed 3000 photos in -1648.023965 seconds
Processed 3500 photos in -1926.930349 seconds
Processed 4000 photos in -2206.056092 seconds
Processed 4500 photos in -2485.632816 seconds
Processed 5000 photos in -2765.037676 seconds
Processed 5500 photos in -3044.301722 seconds
Processed 6000 photos in -3318.817983 seconds
Processed 6500 photos in -3593.101361 seconds
Processed 7000 photos in -3868.167157 seconds
Processed 7500 photos in -4142.990487 seconds
Processed 8000 photos in -4417.113142 seconds
Processed 8500 photos in -4691.361947 seconds
Processed 9000 photos in -4966.055367 seconds
Processed 9500 photos in -5240.475892 seconds
Processed 10000 photos in -5514.582549 seconds
Processed 10500 photos in -5788.835517 seco

In [5]:
valid_features = extract_photo_features("../data/split_lists/valid_ids.csv")

Processed 0 photos in -0.000293 seconds
Processed 500 photos in -288.742708 seconds
Processed 1000 photos in -574.111870 seconds
Processed 1500 photos in -860.867980 seconds
Processed 2000 photos in -1145.466990 seconds
Processed 2500 photos in -1428.325070 seconds
Processed 3000 photos in -1712.472551 seconds
Processed 3500 photos in -1996.174122 seconds
Processed 4000 photos in -2280.400090 seconds
Processed 4500 photos in -2573.160216 seconds
Processed 5000 photos in -2855.648873 seconds
Processed 5500 photos in -3138.568023 seconds
Processed 6000 photos in -3421.359722 seconds
Processed 6500 photos in -3705.590149 seconds
Processed 7000 photos in -3988.211679 seconds
Processed 7500 photos in -4271.022012 seconds
Processed 8000 photos in -4555.384963 seconds
Processed 8500 photos in -4839.588871 seconds
Processed 9000 photos in -5122.717281 seconds
Processed 9500 photos in -5405.252099 seconds
Processed 10000 photos in -5689.548842 seconds
Processed 10500 photos in -5971.535603 seco

In [11]:
mkdir ../data/features

In [13]:
with open("../data/features/train_features.pkl", "wb") as handle:
    pickle.dump(train_features, handle)

In [14]:
with open("../data/features/valid_features.pkl", "wb") as handle:
    pickle.dump(valid_features,handle)

In [16]:
len(valid_features)

20162

In [15]:
len(train_features)

80645

In [21]:
to_categorical(range(10), 1000).nbytes

80000

In [24]:
csr_matrix(to_categorical(range(10), 1000)).nbytes

AttributeError: nbytes not found