# Notebook to extract features with a semantic CNN trained on the ImageNet dataset
***
## Imports

In [2]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications import imagenet_utils
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from ConvFuncs import HDF5DatasetWriter
import numpy as np
import progressbar
import random
import os
import jobsConfig as config
import h5py
from preprocessors import AspectAwarePreprocessor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import joblib

## Initializations 

In [3]:
#Batch size used for feature extraction and buffer size handed to the HDF5 writers
batch_size, buffer_size = 16, 1000

#Destination files for the extracted ResNet50 features
train_output, val_output = (
    "../data/hdf5/train_ResNet50.hdf5",
    "../data/hdf5/val_ResNet50.hdf5",
)

#Open each source database read-only and keep handles to its image/label datasets
train_database = h5py.File(config.TRAIN_HDF5, "r")
X_train, y_train = train_database["images"], train_database["labels"]

val_database = h5py.File(config.VAL_HDF5, "r")
X_val, y_val = val_database["images"], val_database["labels"]

In [7]:
#Count the training instances labelled 0. The labels are read from HDF5 once
#(the original read the full dataset twice: once for the mask, once for the indexing)
int(np.count_nonzero(y_train[:] == 0))

833

## Start with ResNet50 just like Adrian 

In [3]:
#Load ResNet50 with ImageNet weights; include_top = False leaves out the fully connected classification head
RN50 = ResNet50(weights = "imagenet", include_top = False)

In [4]:
#Summarise the layers of a model
def inspect_model(model):
    """Print one tab-separated row per layer: index, output shape and layer class name."""
    row_template = "{}\t{}\t{}"
    for position, current in enumerate(model.layers):
        shape = current.output_shape
        kind = current.__class__.__name__
        print(row_template.format(position, shape, kind))

In [5]:
#Print index, output shape and class name for every layer of the loaded ResNet50
inspect_model(RN50)

0	[(None, None, None, 3)]	InputLayer
1	(None, None, None, 3)	ZeroPadding2D
2	(None, None, None, 64)	Conv2D
3	(None, None, None, 64)	BatchNormalization
4	(None, None, None, 64)	Activation
5	(None, None, None, 64)	ZeroPadding2D
6	(None, None, None, 64)	MaxPooling2D
7	(None, None, None, 64)	Conv2D
8	(None, None, None, 64)	BatchNormalization
9	(None, None, None, 64)	Activation
10	(None, None, None, 64)	Conv2D
11	(None, None, None, 64)	BatchNormalization
12	(None, None, None, 64)	Activation
13	(None, None, None, 256)	Conv2D
14	(None, None, None, 256)	Conv2D
15	(None, None, None, 256)	BatchNormalization
16	(None, None, None, 256)	BatchNormalization
17	(None, None, None, 256)	Add
18	(None, None, None, 256)	Activation
19	(None, None, None, 64)	Conv2D
20	(None, None, None, 64)	BatchNormalization
21	(None, None, None, 64)	Activation
22	(None, None, None, 64)	Conv2D
23	(None, None, None, 64)	BatchNormalization
24	(None, None, None, 64)	Activation
25	(None, None, None, 256)	Conv2D
26	(None, None, 

**The network was loaded with `include_top = False`, so it has no fully connected output layers. Get a baseline by using it as a fixed feature extractor and flattening its final activations.**

In [6]:
#Writer for the training features: the dims argument asks for one row of 100352 values per training label.
#NOTE(review): 100352 presumably equals 7 * 7 * 2048, the flattened ResNet50 output for 224x224 inputs — confirm
train_writer = HDF5DatasetWriter(train_output, (len(y_train), 100352), dataKey = "features", bufSize = buffer_size)

The supplied database ../data/hdf5/train_ResNet50.hdf5 already exists. Would you like to delete the database and proceede (y/n)?:  y


In [16]:
def extract_features(X, y, writer, model, batch_size, num_instances, target_size = (224, 224)):
    """Run images through a network in batches and store the flattened activations.

    Parameters
    ----------
    X : sliceable collection of images (e.g. an HDF5 dataset), indexed along its first axis.
    y : sliceable collection of labels aligned with X.
    writer : object exposing add(rows, labels) and close(). It is always closed
        before this function returns or raises, so the underlying file is
        released even when extraction fails part-way.
    model : object exposing predict(batch, batch_size = ...).
    batch_size : number of images pushed through the model per step.
    num_instances : number of leading instances of X and y to process.
    target_size : the two size arguments handed to AspectAwarePreprocessor
        (defaults to the previously hard-coded 224, 224).

    Returns
    -------
    None. The features are handed to writer.add, one flattened row per image.
    """
    #First initialize progressbar and preprocessor
    widgets = ["Extracting Features: ", progressbar.Percentage(), " ", progressbar.Bar(), " ", progressbar.ETA()]
    pbar = progressbar.ProgressBar(maxval = num_instances, widgets = widgets).start()

    aap = AspectAwarePreprocessor(*target_size)

    try:
        #Loop through images in batches
        for start in np.arange(0, num_instances, batch_size):
            #Clamp the last batch so nothing beyond num_instances is read
            stop = min(start + batch_size, num_instances)
            batch_labels = y[start:stop]

            #Resize every image, make it a keras compatible array and stack into one batch
            batch = np.stack([img_to_array(aap.preprocess(image)) for image in X[start:stop]])

            #Apply the ImageNet preprocessing to the whole batch at once
            batch = imagenet_utils.preprocess_input(batch)

            #Extract features and flatten each sample; -1 infers the feature
            #length from the model output instead of assuming 100352
            features = model.predict(batch, batch_size = batch_size)
            features = features.reshape((features.shape[0], -1))

            #Add features and report the number of instances completed so far
            writer.add(features, batch_labels)
            pbar.update(stop)
    finally:
        #Close database even if a batch failed, so the HDF5 file is not left open
        writer.close()

    #Finish progressbar
    pbar.finish()

In [17]:
#Extract ResNet50 features for every training image and hand them to train_writer (closed by the call)
extract_features(X_train, y_train, train_writer, RN50, batch_size, len(y_train))

Extracting Features: 100% |#####################################| Time: 0:02:40


In [18]:
#Writer for the validation features, using the same 100352-wide rows as the training writer
val_writer = HDF5DatasetWriter(val_output, (len(y_val), 100352), dataKey = "features", bufSize = buffer_size)

In [19]:
#Extract ResNet50 features for every validation image and hand them to val_writer (closed by the call)
extract_features(X_val, y_val, val_writer, RN50, batch_size, len(y_val))

Extracting Features: 100% |#####################################| Time: 0:00:51


***
## Train logistic regression model on features extracted by ResNet50

In [21]:
#Read in datasets: open the extracted-feature files read-only
#(the paths are the same strings assigned to train_output / val_output earlier in the notebook)
train = h5py.File("../data/hdf5/train_ResNet50.hdf5", "r")
val = h5py.File("../data/hdf5/val_ResNet50.hdf5", "r")

In [28]:
#Hyperparameter candidates for the logistic regressor
#NOTE(review): 0.1 is absent from the C values — confirm that the gap is intentional
grid = {"C": [0.0001, 0.001, 0.01, 1, 10], "max_iter":[100,500]}

#Exhaustive 5-fold cross-validated search over the candidates
log_reg = LogisticRegression(verbose = 1, random_state = 42)
log_grid = GridSearchCV(estimator = log_reg, param_grid = grid, cv = 5)
log_grid.fit(X = train["features"][:], y = train["labels"][:])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

GridSearchCV(cv=5, estimator=LogisticRegression(random_state=42, verbose=1),
             param_grid={'C': [0.0001, 0.001, 0.01, 1, 10],
                         'max_iter': [100, 500]})

In [31]:
#Take the best estimator found by the grid search and measure its accuracy on the validation features
log_reg_best = log_grid.best_estimator_
preds = log_reg_best.predict(X = val["features"][:])
score = accuracy_score(y_true = val["labels"][:], y_pred = preds)
score

0.9963963963963964

In [33]:
#Make sure the target directory exists first; dumping into a missing folder raises FileNotFoundError
os.makedirs("output", exist_ok = True)

#Persist the best logistic regressor to disk
joblib.dump(log_reg_best, "output/colorstrom.pkl")

['output/colorstrom.pkl']

In [35]:
#Show the hyperparameter combination selected by the grid search
log_grid.best_params_

{'C': 0.01, 'max_iter': 100}