Feature Extraction
===============

This notebook uses TensorFlow/Keras to extract VGG16 `fc2` features from the image datasets and saves them to HDF5 (`.h5`) files.

In [None]:
from tensorflow.keras.applications.vgg16 import (VGG16, preprocess_input)
from tensorflow.keras.models import Model

In [None]:
# Build the stock VGG16 network using the constructor's default arguments.
vgg16 = VGG16()

# Wrap it in a model that shares VGG16's input but stops at the "fc2" layer,
# so predict() returns that layer's activations.
extractor = Model(inputs=vgg16.input, outputs=vgg16.get_layer("fc2").output)

In [None]:
def preprocess(fp):
    """Load one image file and convert it into a single-image VGG16 input batch.

    Args:
        fp: Path of the image file; passed unchanged to ``Image.open``.

    Returns:
        The result of ``preprocess_input`` applied to the image after it has
        been resized to 224x224, converted to RGB and given a leading batch
        axis of length 1.
    """
    # Scope the opened file with a context manager so the handle is released
    # deterministically, including when decoding the image raises.
    with Image.open(fp) as source:
        # resize() returns a new, fully loaded image that outlives the file.
        img = source.resize((224, 224))

    # Same order as before (resize first, then RGB) so pixel values are unchanged.
    img = img.convert('RGB')
    img = np.array(img)
    # Add the batch axis so per-image results can be concatenated by callers.
    img = np.expand_dims(img, 0)
    img = preprocess_input(img)
    return img

In [None]:
import tarfile
from PIL import Image
import numpy as np
import pandas as pd
import sys, os, glob
import tempfile, shutil
from tqdm.notebook import tqdm
from multiprocessing import Pool

### Extract Target Set
This code will process the image archives from the target set. If you have not done so already, open and run the cells in `01_download.ipynb` that download the target set.

**Note**: There are 16 archives in total; processing each one may take up to an hour.

In [None]:
# this parameter controls the number of images loaded into memory at once.
# lower it if you are having problems.
split_size = 5000

archive_paths = sorted(glob.glob('/tf/open_images/targets/images/train_*.tar.gz'))
pool = Pool()

try:
    # makedirs(exist_ok=True) also creates missing parent directories and is a
    # no-op when the folder already exists.
    os.makedirs('/tf/open_images/targets/features/', exist_ok=True)

    for archive_path in tqdm(archive_paths, desc='all files'):
        ar_name = os.path.basename(archive_path).replace('.tar.gz', '')
        feat_path = '/tf/open_images/targets/features/%s.h5' % ar_name.replace('train', 'targets')

        # an existing feature file marks this archive as already processed.
        if os.path.exists(feat_path):
            continue

        # write into a scratch file and rename it only once the whole archive
        # succeeded. Otherwise an interrupted run would leave a truncated
        # feat_path behind that the check above would skip on every rerun.
        partial_path = feat_path + '.partial'

        # the context managers release the extraction folder, the tar handle
        # and the HDF5 store even if processing an image raises.
        with tempfile.TemporaryDirectory() as tempdir:
            with tarfile.open(archive_path) as archive:
                archive.extractall(tempdir)

            image_paths = sorted(glob.glob(os.path.join(tempdir, ar_name, "*.jpg")))
            if not image_paths:
                raise ValueError('no .jpg images found under %r in %s' % (ar_name, archive_path))

            # ceiling division: always at least one split, and never a split
            # larger than split_size (a truncated float quotient gave zero
            # sections for small archives and oversized splits otherwise).
            n_splits = -(-len(image_paths) // split_size)
            splits = np.array_split(np.array(image_paths), n_splits)

            with pd.HDFStore(partial_path, complevel=9, mode='w') as feature_store:
                for split in tqdm(splits, desc=ar_name):
                    image_ids = [ os.path.basename(path).replace('.jpg', '') for path in split ]

                    # decode/resize in the worker pool, then stack into one batch.
                    image_array = np.concatenate(pool.map(preprocess, split))

                    features = extractor.predict(image_array)
                    # drop the large pixel batch before building the frame.
                    del image_array

                    frame = pd.DataFrame(features, index=image_ids)
                    del features

                    feature_store.append('df', frame)

        # the store is closed and complete: publish it under the final name.
        os.replace(partial_path, feat_path)

finally:
    pool.close()

### Extract Validation Set

In [None]:
# maximum number of images loaded into memory at once.
split_size = 5000
pool = Pool()

try:
    # makedirs(exist_ok=True) also creates missing parent directories and is a
    # no-op when the folder already exists.
    os.makedirs('/tf/open_images/validation/features/', exist_ok=True)

    archive_path = '/tf/open_images/validation/images/validation.tar.gz'
    ar_name = 'validation'
    feat_path = '/tf/open_images/validation/features/validation.h5'

    # an existing feature file means the validation set was already processed.
    if not os.path.exists(feat_path):

        # write into a scratch file and rename it only after everything
        # succeeded, so an interrupted run cannot leave a truncated feat_path
        # that the check above would treat as finished.
        partial_path = feat_path + '.partial'

        # the context managers release the extraction folder, the tar handle
        # and the HDF5 store even if processing an image raises.
        with tempfile.TemporaryDirectory() as tempdir:
            with tarfile.open(archive_path) as archive:
                archive.extractall(tempdir)

            image_paths = sorted(glob.glob(os.path.join(tempdir, ar_name, "*.jpg")))
            if not image_paths:
                raise ValueError('no .jpg images found under %r in %s' % (ar_name, archive_path))

            # ceiling division: always at least one split, and never a split
            # larger than split_size (a truncated float quotient gave zero
            # sections for small archives and oversized splits otherwise).
            n_splits = -(-len(image_paths) // split_size)
            splits = np.array_split(np.array(image_paths), n_splits)

            with pd.HDFStore(partial_path, complevel=9, mode='w') as feature_store:
                for split in tqdm(splits, desc=ar_name):
                    image_ids = [ os.path.basename(path).replace('.jpg', '') for path in split ]

                    # decode/resize in the worker pool, then stack into one batch.
                    image_array = np.concatenate(pool.map(preprocess, split))

                    features = extractor.predict(image_array)
                    # drop the large pixel batch before building the frame.
                    del image_array

                    frame = pd.DataFrame(features, index=image_ids)
                    del features

                    feature_store.append('df', frame)

        # the store is closed and complete: publish it under the final name.
        os.replace(partial_path, feat_path)

finally:
    pool.close()

### Extract Training Set
This code will process the images from the encoder training set. If you have not done so already, open and run the cells in `01_download.ipynb` that download the training set.

In [None]:
# makedirs(exist_ok=True) also creates missing parent directories and is a
# no-op when the folder already exists.
os.makedirs('/tf/open_images/train/features', exist_ok=True)

# one output file per leading hex digit: '0' .. 'f'.
prefixes = [ "%x"%i for i in range(16) ]

pool = Pool()
try:
    for prefix in tqdm(prefixes, desc='all files'):

        feat_path = '/tf/open_images/train/features/train_%s.h5' % prefix

        # an existing feature file marks this prefix as already processed.
        if os.path.exists(feat_path):
            continue

        # write into a scratch file and rename it only once the whole prefix
        # succeeded. Otherwise an interrupted run would leave a truncated
        # feat_path behind that the check above would skip on every rerun.
        partial_path = feat_path + '.partial'

        image_dirs = sorted(glob.glob('/tf/open_images/train/images/%s*' % prefix ))

        # the context manager closes the store even if an image fails.
        with pd.HDFStore(partial_path, complevel=9, mode='w') as feature_store:

            for image_dir in tqdm(image_dirs, desc="prefix %s" % prefix):

                image_paths = sorted(glob.glob(os.path.join(image_dir, '*.jpg')))

                # np.concatenate rejects an empty list, so a folder without
                # any .jpg files is skipped instead of aborting the run.
                if not image_paths:
                    continue

                image_ids = [ os.path.basename(path).replace('.jpg', '') for path in image_paths ]

                # decode/resize in the worker pool, then stack into one batch.
                image_array = np.concatenate(pool.map(preprocess, image_paths))

                features = extractor.predict(image_array)
                # drop the large pixel batch before building the frame.
                del image_array

                frame = pd.DataFrame(features, index=image_ids)
                del features

                feature_store.append('df', frame)

        # the store is closed and complete: publish it under the final name.
        os.replace(partial_path, feat_path)

finally:
    pool.close()

### Extract Extended Target Set
This code will process the images from the extended target set. If you have not done so already, open and run the cells in `01_download.ipynb` that download the extended target set.

In [None]:
# makedirs(exist_ok=True) also creates missing parent directories and is a
# no-op when the folder already exists.
os.makedirs('/tf/open_images/extended_targets/features', exist_ok=True)

# one output file per leading hex digit: '0' .. 'f'.
prefixes = [ "%x"%i for i in range(16) ]

pool = Pool()
try:
    for prefix in tqdm(prefixes, desc='all files'):

        feat_path = '/tf/open_images/extended_targets/features/extended_targets_%s.h5' % prefix

        # an existing feature file marks this prefix as already processed.
        if os.path.exists(feat_path):
            continue

        # write into a scratch file and rename it only once the whole prefix
        # succeeded. Otherwise an interrupted run would leave a truncated
        # feat_path behind that the check above would skip on every rerun.
        partial_path = feat_path + '.partial'

        image_dirs = sorted(glob.glob('/tf/open_images/extended_targets/images/%s*' % prefix ))

        # the context manager closes the store even if an image fails.
        with pd.HDFStore(partial_path, complevel=9, mode='w') as feature_store:

            for image_dir in tqdm(image_dirs, desc="prefix %s" % prefix):

                image_paths = sorted(glob.glob(os.path.join(image_dir, '*.jpg')))

                # np.concatenate rejects an empty list, so a folder without
                # any .jpg files is skipped instead of aborting the run.
                if not image_paths:
                    continue

                image_ids = [ os.path.basename(path).replace('.jpg', '') for path in image_paths ]

                # decode/resize in the worker pool, then stack into one batch.
                image_array = np.concatenate(pool.map(preprocess, image_paths))

                features = extractor.predict(image_array)
                # drop the large pixel batch before building the frame.
                del image_array

                frame = pd.DataFrame(features, index=image_ids)
                del features

                feature_store.append('df', frame)

        # the store is closed and complete: publish it under the final name.
        os.replace(partial_path, feat_path)

finally:
    pool.close()

### Extract Query Set
This code will process the images from the query set. You may use the included query images, or add your own `.jpg` images to the `data/queries/images` folder.

In [None]:
# sorted() makes the row order of the saved features reproducible; glob()
# alone returns files in arbitrary, filesystem-dependent order.
query_image_paths = sorted(glob.glob('/tf/primo/data/queries/images/*.jpg'))

# fail with a clear message instead of np.concatenate's
# "need at least one array to concatenate".
if not query_image_paths:
    raise ValueError("no .jpg images found in /tf/primo/data/queries/images")

# splitext strips only the trailing extension, so user-supplied names that
# contain '.jpg' elsewhere (e.g. 'a.jpg_copy.jpg') keep their full stem.
query_ids = [os.path.splitext(os.path.basename(path))[0] for path in query_image_paths]

# each preprocess() call yields a batch of one image; stack them into one array.
query_image_array = np.concatenate([preprocess(image_path) for image_path in query_image_paths])
query_features = extractor.predict(query_image_array)

# one row per query image, indexed by its file stem.
pd.DataFrame(query_features, index=query_ids).to_hdf('/tf/primo/data/queries/features.h5', key='df', mode='w')