<a href="https://colab.research.google.com/github/vinayakShenoy/DL4CV/blob/master/Code/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature extraction process
- Python script that can be used to extract features from an arbitrary image dataset.

In [None]:
# Install the third-party helpers that the import cell of this notebook relies on.
!pip install imutils
!pip install import_ipynb
!pip install progressbar
# Fetch the DL4CV repository and make its Code directory the working directory.
!git clone http://github.com/vinayakShenoy/DL4CV
%cd DL4CV/Code

In [23]:
# Download the animals (dog/cat/panda) and Caltech-101 datasets with the Kaggle CLI.
# NOTE(review): presumably needs Kaggle API credentials configured beforehand — confirm.
!kaggle datasets download -d ashishsaxena2209/animal-image-datasetdog-cat-and-panda
!kaggle datasets download -d athota1/caltech101

Downloading animal-image-datasetdog-cat-and-panda.zip to /content/DL4CV/Code
 98% 367M/376M [00:04<00:00, 89.4MB/s]
100% 376M/376M [00:04<00:00, 85.9MB/s]


In [None]:
# Unpack the downloaded archives; each glob is expected to match one of the two
# zip files in the working directory.
!unzip animal*
!unzip calte*

In [40]:
# Count the entries (class folders) in the animals dataset.
!ls animals/animals|wc -l 
# Delete everything matching BACK* inside the Caltech folder (presumably the
# background/clutter category — confirm), then count the remaining categories.
!rm -rf 101_Ob*/BACK*
!ls 101_ObjectCategories|wc -l

3
101


In [3]:
from tensorflow.keras.applications import VGG16, imagenet_utils
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from sklearn.preprocessing import LabelEncoder
import numpy as np
import imutils
import numpy as np
import random
import os
import progressbar
import import_ipynb
from pyimage.io.hdf5py import HDF5DatasetWriter
from imutils import paths

importing Jupyter notebook from /content/DL4CV/Code/pyimage/io/hdf5py.ipynb


In [4]:
def extract_features(dataset, output, batch_size=32, buffer_size=1000):
  """Run every image under `dataset` through VGG16 and store the features in HDF5.

  The image paths are shuffled (using the global `random` state, so the order
  is not reproducible unless the caller seeds it) so that downstream code can
  build train/test splits by simple array slicing.

  Args:
    dataset: root directory searched for images; the name of each image's
      parent directory is used as its class label.
    output: path handed to HDF5DatasetWriter as the output file.
    batch_size: number of images pushed through the network per step.
    buffer_size: value forwarded to HDF5DatasetWriter as `bufSize`
      (presumably the number of rows buffered before a flush — confirm
      against the writer implementation).

  Returns:
    None. Features are written under the "features" key of the output file.
  """
  # size of one flattened feature vector: the network output for a 224x224
  # input is reshaped below to 512 * 7 * 7 values per image
  feature_dim = 512 * 7 * 7

  # grab the list of images that we'll be describing then randomly
  # shuffle them to allow for easy training and testing splits via
  # array slicing during training time
  print("INFO loading images")
  imagePaths = list(paths.list_images(dataset))
  random.shuffle(imagePaths)

  # extract the class labels from the image paths (parent directory name)
  # then encode them as integers
  labels = [p.split(os.path.sep)[-2] for p in imagePaths]
  le = LabelEncoder()
  labels = le.fit_transform(labels)

  # load vgg16 network without its fully-connected head
  print("INFO loading network")
  model = VGG16(weights="imagenet", include_top=False)

  # init hdf5 writer; kept under its own name so the `dataset` argument is
  # not shadowed
  writer = HDF5DatasetWriter((len(imagePaths), feature_dim),
                             output, dataKey="features", bufSize=buffer_size)

  # make sure the writer is closed even if a batch fails part-way through
  # (e.g. an unreadable image); otherwise the output file stays open
  try:
    # store class label names in the dataset
    writer.storeClassLabels(le.classes_)

    widgets = ["extracting features: ", progressbar.Percentage(), " ", progressbar.Bar(), " ", progressbar.ETA()]
    pbar = progressbar.ProgressBar(maxval=len(imagePaths), widgets=widgets).start()

    for i in range(0, len(imagePaths), batch_size):
      # extract the batch of images and labels, then init the list of actual images that
      # will be passed through the network for feature extraction
      batchPaths = imagePaths[i:i+batch_size]
      batchLabels = labels[i:i+batch_size]
      batchImages = []

      for imagePath in batchPaths:
        # load the input image and resize to vgg16 input size
        image = load_img(imagePath, target_size=(224,224))
        image = img_to_array(image)

        # add a leading batch axis, then apply the ImageNet preprocessing
        image = np.expand_dims(image, axis=0)
        image = imagenet_utils.preprocess_input(image)

        # add image to batch
        batchImages.append(image)

      # pass the images through the network and use the outputs as our actual features
      batchImages = np.vstack(batchImages)
      features = model.predict(batchImages, batch_size=batch_size)

      # reshape the features so that each image is represented by a flattened
      # feature vector of the final pooling outputs
      features = features.reshape((features.shape[0], feature_dim))

      # add features and labels to HDF5 dataset
      writer.add(features, batchLabels)
      pbar.update(i)

    pbar.finish()
  finally:
    writer.close()

In [6]:
# Extract features from the animals dataset into features_animals.hdf5,
# using the default batch and buffer sizes.
extract_features("animals/animals", "features_animals.hdf5")

INFO loading images
INFO loading network





In [8]:
# Extract features from the Caltech-101 dataset into features_caltech101.hdf5,
# using the default batch and buffer sizes.
extract_features("101_ObjectCategories", "features_caltech101.hdf5")

INFO loading images
INFO loading network





In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import pickle
import h5py

In [12]:
# db: path to hdf5 dataset containing extracted features
# model: path to our output logistic regression classifier
# jobs: number of concurrent jobs to use when running the grid search that tunes the hyperparameters.
def train_model(db, model=None, jobs=1):
  """Tune, evaluate and optionally save a logistic-regression classifier.

  The first 75% of the rows in the HDF5 file are used for training (with a
  3-fold grid search over `C`) and the remaining 25% for evaluation. This
  assumes the rows were shuffled when the file was written.

  Args:
    db: path to the HDF5 file; it is read for the "features", "labels" and
      "label_names" datasets.
    model: optional output path. When given, the best estimator found by the
      grid search is pickled to this path; when None nothing is written.
    jobs: number of concurrent jobs used by the grid search.

  Returns:
    None. The best hyperparameters and a classification report are printed.
  """
  # open read-only and guarantee the handle is released when we are done
  with h5py.File(db, "r") as store:
    # index of the train/test boundary (75% / 25%)
    split = int(store["labels"].shape[0]*0.75)

    # define set of parameters that we want to tune then start a grid search where
    # we evaluate our model for each value of C
    print("INFO tuning hyperparameters")
    params = {"C":[0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]}
    # kept under its own name so the `model` output-path argument survives
    grid = GridSearchCV(LogisticRegression(solver="lbfgs",
                                           multi_class="auto"), params, cv=3, n_jobs=jobs)
    grid.fit(store["features"][:split], store["labels"][:split])
    print("INFO best hyperparameters: {}".format(grid.best_params_))

    # evaluate model on the held-out rows
    preds = grid.predict(store["features"][split:])
    print(classification_report(store["labels"][split:], preds, target_names=store["label_names"]))

  # serialize the best estimator to disk when an output path was requested
  if model is not None:
    with open(model, "wb") as f:
      pickle.dump(grid.best_estimator_, f)

In [13]:
# Tune and evaluate a classifier on the animals features; no output model path is passed.
train_model("features_animals.hdf5")

INFO tuning hyperparameters
INFO best hyperparameters: {'C': 10000.0}
              precision    recall  f1-score   support

        cats       0.96      1.00      0.98       244
        dogs       1.00      0.96      0.98       267
       panda       1.00      1.00      1.00       239

    accuracy                           0.99       750
   macro avg       0.99      0.99      0.99       750
weighted avg       0.99      0.99      0.99       750



In [14]:
# Tune and evaluate a classifier on the Caltech-101 features; no output model path is passed.
train_model("features_caltech101.hdf5")

INFO tuning hyperparameters


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

INFO best hyperparameters: {'C': 1.0}
                 precision    recall  f1-score   support

          Faces       0.98      1.00      0.99       111
     Faces_easy       1.00      0.98      0.99       120
       Leopards       0.96      1.00      0.98        50
     Motorbikes       1.00      0.99      1.00       200
      accordion       1.00      1.00      1.00        18
      airplanes       1.00      0.99      1.00       197
         anchor       1.00      0.90      0.95        10
            ant       0.83      0.71      0.77         7
         barrel       1.00      1.00      1.00        14
           bass       0.78      0.82      0.80        17
         beaver       0.80      0.73      0.76        11
      binocular       1.00      0.75      0.86         8
         bonsai       0.94      0.92      0.93        37
          brain       0.96      0.92      0.94        26
   brontosaurus       0.92      0.86      0.89        14
         buddha       0.95      1.00      0.97   