# Part C: Extract Features

### Approach:
    1) ResNet50 features are extracted for each generated and preprocessed image listed in the train and test files
    2) The extracted features are saved to disk to avoid extracting them again (in case they are required later)

In [1]:
import os
import pickle
import pandas as pd
import numpy as np

from keras.applications import resnet50
from keras.preprocessing import image

from PIL import Image

In [2]:
# Load the train/test index tables; later cells read their 'label' and
# 'file_path_preprocessed' columns.
df_train = pd.read_csv("train_file.csv")
df_test = pd.read_csv("test_file.csv")

In [3]:
# Number of training rows per label (shows how balanced the classes are)
df_train['label'].value_counts()

Roboto           2016
Arimo            1680
OpenSans         1680
Ubuntu           1344
Oswald           1176
DancingScript     840
PTSerif           672
NotoSans          672
PatuaOne          168
FredokaOne        168
Name: label, dtype: int64

In [4]:
# Number of test rows per label (shows how balanced the classes are)
df_test['label'].value_counts()

Roboto           864
Arimo            720
OpenSans         720
Ubuntu           576
Oswald           504
DancingScript    360
PTSerif          288
NotoSans         288
FredokaOne        72
PatuaOne          72
Name: label, dtype: int64

In [5]:
# Lazily-built ResNet50 instance shared by every call to save_restnet_features,
# so the ImageNet weights are loaded once instead of once per chunk.
_RESNET50_MODEL_CACHE = {}


def _get_resnet50_model():
    """Return the headless, max-pooled ResNet50 model, building it on first use."""
    if 'model' not in _RESNET50_MODEL_CACHE:
        # Max pooling is used as the original features are 7 x 7 x 2048
        _RESNET50_MODEL_CACHE['model'] = resnet50.ResNet50(
            weights='imagenet', include_top=False, pooling='max')
    return _RESNET50_MODEL_CACHE['model']


def save_restnet_features(filepaths):
    """
    Extract a RESNET50 feature vector for each image and pickle it to disk.

    Parameters
    ----------
    filepaths : iterable of str
        Paths of the images to process. All images of one call are stacked
        into a single array before prediction.
        NOTE(review): this presumably requires every image to have the same
        size and channel count - confirm against the preprocessing step.

    Returns
    -------
    list of numpy.ndarray
        One flat feature vector of length 2048 per input path, in input
        order. An empty list is returned when ``filepaths`` is empty.

    Side effects
    ------------
    Writes one pickle file per image into ``data/extracted_features/``
    (the directory is created if it does not exist yet).
    """

    filepaths = list(filepaths)
    if not filepaths:
        # Nothing to do; avoids feeding an empty array into the network.
        return []

    # Load all images and convert them to arrays while the file is still open,
    # so no extra PIL copies have to be kept around.
    image_arrays = []
    for filepath in filepaths:
        with Image.open(filepath) as pil_image:
            image_arrays.append(image.img_to_array(pil_image))

    # Preprocessing for Resnet50
    processed_arrays = resnet50.preprocess_input(np.array(image_arrays))

    # Reuse the cached pretrained model instead of rebuilding it on every call
    model = _get_resnet50_model()
    features = model.predict(processed_arrays)

    # Flatten features
    features = [x.reshape(2048) for x in features]

    # Save all features
    os.makedirs('data/extracted_features', exist_ok=True)
    for filepath, feature in zip(filepaths, features):
        # NOTE(review): replace('.png', 'pkl') drops the dot, so "a.png" is
        # stored as "apkl" rather than "a.pkl". Kept unchanged here because
        # code that reads these files may rely on the current names - confirm
        # before renaming.
        filename = filepath.split('/')[-1].replace('.png', 'pkl')
        save_filepath = 'data/extracted_features/{}'.format(filename)
        with open(save_filepath, 'wb') as save_file:
            pickle.dump(feature, save_file, protocol=4)

    return features


def extract_features(filepaths, chunk_size=1000):
    """
    Process ``filepaths`` in consecutive slices of at most ``chunk_size``
    entries, handing each slice to "save_restnet_features".

    Working slice by slice keeps memory usage bounded, because the images
    and features of only one slice are held in memory at a time.

    """

    # Make sure the output directory exists before any slice is processed
    os.makedirs('data/extracted_features', exist_ok=True)

    batch_offsets = range(0, len(filepaths), chunk_size)
    for offset in batch_offsets:
        batch_number = offset/chunk_size
        current_batch = filepaths[offset:offset + chunk_size]

        print ("Extracting Features for batch:", batch_number)
        save_restnet_features(current_batch)

In [6]:
%%time

# Extract and save RESNET50 features for every preprocessed training image
extract_features(list(df_train['file_path_preprocessed']))

Extracting Features for batch: 0.0
Extracting Features for batch: 1.0
Extracting Features for batch: 2.0
Extracting Features for batch: 3.0
Extracting Features for batch: 4.0
Extracting Features for batch: 5.0
Extracting Features for batch: 6.0
Extracting Features for batch: 7.0
Extracting Features for batch: 8.0
Extracting Features for batch: 9.0
Extracting Features for batch: 10.0
CPU times: user 1h 5min 3s, sys: 3min 6s, total: 1h 8min 9s
Wall time: 9min 50s


In [7]:
%%time

# Extract and save RESNET50 features for every preprocessed test image
extract_features(list(df_test['file_path_preprocessed']))

Extracting Features for batch: 0.0
Extracting Features for batch: 1.0
Extracting Features for batch: 2.0
Extracting Features for batch: 3.0
Extracting Features for batch: 4.0
CPU times: user 28min 3s, sys: 1min 21s, total: 29min 25s
Wall time: 4min 13s
