# Extract features from a specific class in an image set
To prepare our data, we'll be following what is loosely known as an `ETL` process.

- `E`xtract data from a data source.
- `T`ransform data into a desirable format. (Put it into `tensor` form)
- `L`oad data into a suitable structure. (Put data into an `object` to make it easily accessible.)

In [1]:
import torch
from torch.utils.data import DataLoader, Subset
from torchvision import models, transforms, datasets
from tqdm import tqdm, tqdm_notebook
import numpy as np
import pickle

## Create the database

Here we make a slight modification to the `ImageFolder` class so that each sample also returns the path of its image file. To do this, we subclass `ImageFolder` and override `__getitem__`.

In [2]:
class ImageFolderWithPaths(datasets.ImageFolder):
  """ImageFolder variant whose samples also carry their source file path.

  Each item is whatever torchvision.datasets.ImageFolder returns for that
  index, with the image's file path appended as one extra trailing element.
  Source: https://gist.github.com/andrewjong/6b02ff237533b3b2c554701fb53d5c4d
  """

  def __getitem__(self, index):
    """Return the parent's item for `index` with the file path appended.

    This is the method the dataloader calls to fetch a sample.
    """
    # Let ImageFolder build its usual tuple first.
    base_item = super().__getitem__(index)
    # Each self.imgs entry starts with the image's file path.
    image_path = self.imgs[index][0]
    return base_item + (image_path,)

Preprocessing the images and setting up dataloaders

In [3]:
# [TODO]: Set to True when running in Colab with the images on Google Drive;
# leave False to read the images from the local disk.
googleDrive = False

if not googleDrive:
  # [TODO]: set the directory path once the SSD is loaded
  data_directory = "/home/umar-musashi/Documents/Musashi Part Images/lobe"

else:
  # Colab only: mount Drive so the image folder is reachable as a normal path.
  from google.colab import drive
  drive.mount('/content/drive')
  data_directory = '/content/drive/MyDrive/Musashi Images/lobe/'

# Earlier pipeline, kept for reference: resize to 256, center-crop to 224 and
# random horizontal flip before the tensor conversion and normalization.
# transform = transforms.Compose(
#             [transforms.Resize(256),
#             transforms.CenterCrop(224),
#              transforms.RandomHorizontalFlip(),
#              transforms.ToTensor(),
#              transforms.Normalize(mean=[0.485, 0.456, 0.406],
#                                   std=[0.229, 0.224, 0.225]),
#              ])

# Active pipeline: resize straight to 224x224 (no crop, no augmentation),
# convert to a tensor, then normalize each channel.
# interpolation=2 is the legacy integer code for bilinear resampling
# (PIL's BILINEAR constant).
resize_step = transforms.Resize(size=[224, 224], interpolation=2)
to_tensor_step = transforms.ToTensor()
# The usual ImageNet per-channel mean / std.
normalize_step = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                      std=[0.229, 0.224, 0.225])
transform = transforms.Compose([resize_step, to_tensor_step, normalize_step])

# our custom dataset: every sample also carries its image file path
dataset = ImageFolderWithPaths(data_directory, transform=transform)

# [TODO]: Random split your dataset into train and test if using it
# (80% of the images go to train_set, the remainder to test_set.)
# train_set_size = int(len(dataset) * 0.8)
# test_set_size = len(dataset) - train_set_size
# train_set, test_set = torch.utils.data.random_split(dataset, [train_set_size, test_set_size])
# print(f"[INFO] Length of train_set: {len(train_set)}")
# print(f"[INFO] Length of test_set: {len(test_set)}")
# print(f"[INFO] Total number of images: {len(test_set)+len(train_set)}\n")
# NOTE(review): the two comprehensions below list entries in dataset order,
# whereas a random_split subset is read in the order of its `.indices`. If the
# paths must line up with features extracted from the subset, build them as
# `[dataset.imgs[i] for i in train_set.indices]` instead. They also rescan
# `.indices` once per image, which is quadratic in the dataset size.
# train_img_paths = [pair[1] for pair in enumerate(dataset.imgs) if pair[0] in train_set.indices]
# test_img_paths = [pair[1] for pair in enumerate(dataset.imgs) if pair[0] in test_set.indices]

# [TODO]: Uncomment if a specific class (folder) is wanted
# desired_class = 'lobe_shadow_flat'
# class_idx = dataset.class_to_idx[desired_class]

# Locate the first and last dataset index whose target is the desired class.
# targets = torch.tensor(dataset.targets)
# target_idx = (targets==class_idx).nonzero()
# start = target_idx[0].item()
# end = target_idx[-1].item()
# NOTE(review): `end` is the index of the LAST matching image, so
# range(start, end) leaves that image out; range(start, end + 1) would include
# it. This also assumes the class occupies one contiguous run of indices --
# confirm before relying on it.
# desired_dataset = Subset(dataset, range(start,end))
# # Keep the image paths of each set in separate variables
# desired_dataset_paths = [pair[1] for pair in enumerate(dataset.imgs) if pair[0] in desired_dataset.indices]

# # initialize the dataloaders
# dataloader = DataLoader(desired_dataset, num_workers=2)


# [TODO]: Uncomment if all folders wanted
# Use every class folder under data_directory. The run is labelled with the
# directory's own name; trailing slashes are stripped first so that a path
# such as '.../lobe/' still yields 'lobe' rather than an empty label (which
# would produce output files named '-features.pickle').
desired_class = data_directory.rstrip('/').split('/')[-1]
# Keep the image entries of the set in a separate variable; dataset.imgs holds
# one entry per image, each starting with the image's file path.
desired_dataset_paths = dataset.imgs

# initialize the dataloaders
# batch_size=1 and shuffle=False are DataLoader's defaults; they are spelled
# out because the features are collected one image at a time and must come
# out in the same order as desired_dataset_paths.
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=2)

In [None]:
# Reduce every stored entry to just its file path. The paths are taken from
# desired_dataset_paths itself rather than from the first N rows of
# dataset.imgs, so the result stays correct when only a subset of the dataset
# was selected. Entries that are already plain strings pass through unchanged,
# which keeps this cell safe to re-run.
desired_dataset_paths = [
  entry if isinstance(entry, str) else entry[0]
  for entry in desired_dataset_paths
]
desired_dataset_paths

In [15]:
# Display the class names known to the dataset (see the cell output).
dataset.classes

['lobe_por_dirt',
 'lobe_shadow',
 'lobe_shadow_flat',
 'lobe_side_nonfinished',
 'lobe_sides']

## Download a pretrained model and extract features

We will use the output of the network's last pooling layer (`avgpool`) as the image descriptor.

In [16]:
def pooling_output(x, net=None, stop_layer='avgpool'):
  """Run `x` through a network's top-level layers up to a pooling layer.

  The direct children of the network are applied one after another, in the
  order they were registered, and the walk stops right after the child named
  `stop_layer`. If no child has that name, every child is applied and the
  final output is returned. This only makes sense for networks whose forward
  pass, up to `stop_layer`, is a plain chain of their direct children.

  Args:
    x: input batch, already on the same device as the network.
    net: module whose direct children are applied in sequence. When omitted,
      the module-level `model` is used.
    stop_layer: name of the child after which to stop.

  Returns:
    The output of the `stop_layer` child, or of the last child when no child
    carries that name.
  """
  if net is None:
    # Fall back to the notebook-wide model (reading a global needs no
    # `global` statement).
    net = model
  # named_children() is the public way to walk the direct sub-modules.
  for layer_name, layer in net.named_children():
    x = layer(x)
    if layer_name == stop_layer:
      break
  return x

In [10]:
# Prefer the GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
  DEVICE = 'cuda'
else:
  DEVICE = 'cpu'
print("Using Device:", DEVICE)

# ResNet-50 loaded with torchvision's pretrained weights; it is used only as
# a fixed feature extractor.
model = models.resnet50(pretrained=True)

Using Device: cuda


In [18]:
# One array of descriptors per batch, in dataloader order.
features = []

model.to(DEVICE)
model.eval()  # switch the model to evaluation mode before extracting
with torch.no_grad():  # no gradients are needed for feature extraction
  # Only the images are used here; labels and paths are ignored.
  for inputs, labels, paths in tqdm(dataloader):
    result = pooling_output(inputs.to(DEVICE))
    # Flatten everything except the batch dimension so each image yields one
    # row, whatever batch size the dataloader uses.
    features.append(torch.flatten(result, start_dim=1).cpu().numpy())

# Release cached GPU memory once, after the loop, instead of on every batch.
torch.cuda.empty_cache()

100%|██████████| 5921/5921 [00:39<00:00, 150.23it/s]


# Save extracted features and image paths

In [19]:
# Stack the per-batch arrays into one 2-D array with one row per image.
features = np.vstack(features)
saveDirectory = "/home/umar-musashi/Documents/repos/Content-Based-Image-Retrieval/resources/LobeFeatures"

# Write the features and the matching image paths side by side. The `with`
# blocks make sure each file is flushed and closed even if pickling fails.
with open(f"{saveDirectory}/{desired_class}-features.pickle", 'wb') as features_file:
  pickle.dump(features, features_file)
with open(f"{saveDirectory}/{desired_class}-img-paths.pickle", 'wb') as paths_file:
  pickle.dump(desired_dataset_paths, paths_file)

In [None]:
'''
Coding Plan:
1 - Set the dataset directory and load your caltech101 data using pytorch
2 - Use any model and extract the features
3 - Save the features to a pickle variable in the features folder 

Current Problems:
- FEB-1-2022: Didn't extract features because I realised that the lobe data was
  all loaded into variables and not properly separated by folder name. Need to
  change that before I extract features, since we only want to cluster within
  a specific folder of the lobe data
  - Solution:
    - Creating subsets of each folder first then passing to Dataloader
  
when done:
- add functionality to choose from a list of models for extraction
- let user specify which class they want to extract from
- add another Dataset class that can handle only a single folder as well
- try extracting features from different layers https://towardsdatascience.com/image-feature-extraction-using-pytorch-e3b327c3607a 
'''